sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  */
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/uio.h>
  68 #include <sys/kernel.h>
  69 #include <sys/sysctl.h>
  70 #include <sys/mount.h>
  71 #include <sys/vnode.h>
  72 #include <sys/malloc.h>
  73 #include <sys/sysmsg.h>
  74 #include <sys/spinlock.h>
  75 #include <sys/proc.h>
  76 #include <sys/nlookup.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/fnv_hash.h>
  79 #include <sys/globaldata.h>
  80 #include <sys/kern_syscall.h>
  81 #include <sys/dirent.h>
  82 #include <ddb/ddb.h>
  83
  84 #include <sys/spinlock2.h>
  85
  86 #define MAX_RECURSION_DEPTH     64
  87
  88 /*
  89  * Random lookups in the cache are accomplished with a hash table using
  90  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
  91  * but we use the ncp->update counter trick to avoid acquiring any
  92  * contestable spin-locks during a lookup.
  93  *
  94  * Negative entries may exist and correspond to resolved namecache
  95  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
  96  * will be set if the entry corresponds to a whited-out directory entry
  97  * (verses simply not finding the entry at all).  pcpu_ncache[n].neg_list
  98  * is locked via pcpu_ncache[n].neg_spin;
  99  *
 100  * MPSAFE RULES:
 101  *
 102  * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 103  *     is applicable to direct lookups via the hash table nchpp or via
 104  *     nc_list (the two are added or removed together).  Removal of the ncp
 105  *     from the hash table drops this reference.  The second is applicable
 106  *     to vp->v_namecache linkages (or negative list linkages), and removal
 107  *     of the ncp from these lists drops this reference.
 108  *
 109  *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 110  *     and must be destroyed.  No other thread should have access to it at
 111  *     this point so it can be safely locked and freed without any deadlock
 112  *     fears.
 113  *
 114  *     The 1->0 transition can occur at almost any juncture and so cache_drop()
 115  *     deals with it directly.
 116  *
 117  * (2) Once the 1->0 transition occurs, the entity that caused the transition
 118  *     will be responsible for destroying the ncp.  The ncp cannot be on any
 119  *     list or hash at this time, or be held by anyone other than the caller
 120  *     responsible for the transition.
 121  *
 122  * (3) A ncp must be locked in order to modify it.
 123  *
 124  * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 125  *     This may seem backwards but forward-scans use the hash table and thus
 126  *     can hold the parent unlocked while traversing downward.  Deletions,
 127  *     on the other-hand, tend to propagate bottom-up since the ref on the
 128  *     is dropped as the children go away.
 129  *
 130  * (6) Both parent and child must be locked in order to enter the child onto
 131  *     the parent's nc_list.
 132  */
 133
 134 /*
 135  * Structures associated with name cacheing.
 136  */
 137 #define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
 138 #define MINNEG                  1024
 139 #define MINPOS                  1024
 140 #define NCMOUNT_NUMCACHE        (16384) /* power of 2 */
 141 #define NCMOUNT_SET             (8)     /* power of 2 */
 142
 143 MALLOC_DEFINE_OBJ(M_VFSCACHE, sizeof(struct namecache),
 144                   "namecache", "namecache entries");
 145 MALLOC_DEFINE(M_VFSCACHEAUX, "namecachestr", "namecache strings");
 146
 147 TAILQ_HEAD(nchash_list, namecache);
 148
 149 /*
 150  * Don't cachealign, but at least pad to 32 bytes so entries
 151  * don't cross a cache line.
 152  */
 153 struct nchash_head {
 154        struct nchash_list list; /* 16 bytes */
 155        struct spinlock  spin;   /* 8 bytes */
 156        long     pad01;          /* 8 bytes */
 157 };
 158
 159 struct ncmount_cache {
 160         struct spinlock spin;
 161         struct namecache *ncp;
 162         struct mount *mp;
 163         struct mount *mp_target;
 164         int isneg;
 165         int ticks;
 166         int updating;
 167         int unused01;
 168 };
 169
 170 struct pcpu_ncache {
 171         struct spinlock         umount_spin;    /* cache_findmount/interlock */
 172         struct spinlock         neg_spin;       /* for neg_list and neg_count */
 173         struct namecache_list   neg_list;
 174         long                    neg_count;
 175         long                    vfscache_negs;
 176         long                    vfscache_count;
 177         long                    vfscache_leafs;
 178         long                    vfscache_unres;
 179         long                    numdefered;
 180         long                    inv_kid_quick_count;
 181         long                    inv_ncp_quick_count;
 182         long                    clean_pos_count;
 183         long                    clean_neg_count;
 184 } __cachealign;
 185
 186 __read_mostly static struct nchash_head *nchashtbl;
 187 __read_mostly static struct pcpu_ncache *pcpu_ncache;
 188 static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];
 189
 190 /*
 191  * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 192  * to create the namecache infrastructure leading to a dangling vnode.
 193  *
 194  * 0    Only errors are reported
 195  * 1    Successes are reported
 196  * 2    Successes + the whole directory scan is reported
 197  * 3    Force the directory scan code run as if the parent vnode did not
 198  *      have a namecache record, even if it does have one.
 199  */
 200 __read_mostly int       ncvp_debug;
 201 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
 202     "Namecache debug level (0-3)");
 203
 204 __read_mostly static u_long nchash;             /* size of hash table */
 205 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 206     "Size of namecache hash table");
 207
 208 __read_mostly static int ncnegflush = 10;       /* burst for negative flush */
 209 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
 210     "Batch flush negative entries");
 211
 212 __read_mostly static int ncposflush = 10;       /* burst for positive flush */
 213 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
 214     "Batch flush positive entries");
 215
 216 __read_mostly static int ncnegfactor = 16;      /* ratio of negative entries */
 217 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 218     "Ratio of negative namecache entries");
 219
 220 __read_mostly static int ncposfactor = 16;    /* ratio of unres+leaf entries */
 221 SYSCTL_INT(_debug, OID_AUTO, ncposfactor, CTLFLAG_RW, &ncposfactor, 0,
 222     "Ratio of unresolved leaf namecache entries");
 223
 224 __read_mostly static int nclockwarn;    /* warn on locked entries in ticks */
 225 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
 226     "Warn on locked namecache entries in ticks");
 227
 228 __read_mostly static int ncposlimit;    /* number of cache entries allocated */
 229 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
 230     "Number of cache entries allocated");
 231
 232 __read_mostly static int ncp_shared_lock_disable = 0;
 233 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
 234            &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
 235
 236 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
 237     "sizeof(struct vnode)");
 238 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
 239     "sizeof(struct namecache)");
 240
 241 __read_mostly static int ncmount_cache_enable = 1;
 242 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
 243            &ncmount_cache_enable, 0, "mount point cache");
 244
 245 static __inline void _cache_drop(struct namecache *ncp);
 246 static int cache_resolve_mp(struct mount *mp, int adjgen);
 247 static int cache_findmount_callback(struct mount *mp, void *data);
 248 static void _cache_setunresolved(struct namecache *ncp, int adjgen);
 249 static void _cache_cleanneg(long count);
 250 static void _cache_cleanpos(long ucount, long xcount);
 251 static void _cache_cleandefered(void);
 252 static void _cache_unlink(struct namecache *ncp);
 253
 254 /*
 255  * The new name cache statistics (these are rolled up globals and not
 256  * modified in the critical path, see struct pcpu_ncache).
 257  */
 258 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
 259 static long vfscache_negs;
 260 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
 261     "Number of negative namecache entries");
 262 static long vfscache_count;
 263 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
 264     "Number of namecaches entries");
 265 static long vfscache_leafs;
 266 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
 267     "Number of leaf namecaches entries");
 268 static long vfscache_unres;
 269 SYSCTL_LONG(_vfs_cache, OID_AUTO, numunres, CTLFLAG_RD, &vfscache_unres, 0,
 270     "Number of unresolved leaf namecaches entries");
 271
 272 static long     inv_kid_quick_count;
 273 SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_kid_quick_count, CTLFLAG_RD,
 274             &inv_kid_quick_count, 0,
 275             "quick kid invalidations");
 276 static long     inv_ncp_quick_count;
 277 SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_ncp_quick_count, CTLFLAG_RD,
 278             &inv_ncp_quick_count, 0,
 279             "quick ncp invalidations");
 280 static long     clean_pos_count;
 281 SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_pos_count, CTLFLAG_RD,
 282             &clean_pos_count, 0,
 283             "positive ncp cleanings");
 284 static long     clean_neg_count;
 285 SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_neg_count, CTLFLAG_RD,
 286             &clean_neg_count, 0,
 287             "negative ncp cleanings");
 288
 289 static long     numdefered;
 290 SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
 291     "Number of cache entries allocated");
 292
 293 /*
 294  * Returns the number of basic references expected on the ncp, not
 295  * including any children.  1 for the natural ref, and an addition ref
 296  * if the ncp is resolved (representing a positive or negative hit).
 297  */
 298 static __inline int
 299 ncpbaserefs(struct namecache *ncp)
 300 {
 301         return (1 + ((ncp->nc_flag & NCF_UNRESOLVED) == 0));
 302 }
 303
 304 struct nchstats nchstats[SMP_MAXCPU];
 305 /*
 306  * Export VFS cache effectiveness statistics to user-land.
 307  *
 308  * The statistics are left for aggregation to user-land so
 309  * neat things can be achieved, like observing per-CPU cache
 310  * distribution.
 311  */
 312 static int
 313 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 314 {
 315         struct globaldata *gd;
 316         int i, error;
 317
 318         error = 0;
 319         for (i = 0; i < ncpus; ++i) {
 320                 gd = globaldata_find(i);
 321                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
 322                         sizeof(struct nchstats))))
 323                         break;
 324         }
 325
 326         return (error);
 327 }
 328 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
 329   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 330
 331 static int cache_zap(struct namecache *ncp);
 332
 333 /*
 334  * Cache mount points and namecache records in order to avoid unnecessary
 335  * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 336  * performance and is particularly important on multi-socket systems to
 337  * reduce cache-line ping-ponging.
 338  *
 339  * Try to keep the pcpu structure within one cache line (~64 bytes).
 340  */
 341 #define MNTCACHE_COUNT  32      /* power of 2, multiple of SET */
 342 #define MNTCACHE_SET    8       /* set associativity */
 343
 344 struct mntcache_elm {
 345         struct namecache *ncp;
 346         struct mount     *mp;
 347         int     ticks;
 348         int     unused01;
 349 };
 350
 351 struct mntcache {
 352         struct mntcache_elm array[MNTCACHE_COUNT];
 353 } __cachealign;
 354
 355 static struct mntcache  pcpu_mntcache[MAXCPU];
 356
 357 static __inline
 358 void
 359 _cache_ncp_gen_enter(struct namecache *ncp)
 360 {
 361         ncp->nc_generation += 2;
 362         cpu_sfence();
 363 }
 364
 365 static __inline
 366 void
 367 _cache_ncp_gen_exit(struct namecache *ncp)
 368 {
 369         cpu_sfence();
 370         ncp->nc_generation += 2;
 371         cpu_sfence();
 372 }
 373
 374 static __inline
 375 struct mntcache_elm *
 376 _cache_mntcache_hash(void *ptr)
 377 {
 378         struct mntcache_elm *elm;
 379         int hv;
 380
 381         hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
 382         elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];
 383
 384         return elm;
 385 }
 386
 387 static
 388 void
 389 _cache_mntref(struct mount *mp)
 390 {
 391         struct mntcache_elm *elm;
 392         struct mount *mpr;
 393         int i;
 394
 395         elm = _cache_mntcache_hash(mp);
 396         for (i = 0; i < MNTCACHE_SET; ++i) {
 397                 if (elm->mp == mp) {
 398                         mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
 399                         if (__predict_true(mpr == mp))
 400                                 return;
 401                         if (mpr)
 402                                 atomic_add_int(&mpr->mnt_refs, -1);
 403                 }
 404                 ++elm;
 405         }
 406         atomic_add_int(&mp->mnt_refs, 1);
 407 }
 408
 409 static
 410 void
 411 _cache_mntrel(struct mount *mp)
 412 {
 413         struct mntcache_elm *elm;
 414         struct mntcache_elm *best;
 415         struct mount *mpr;
 416         int delta1;
 417         int delta2;
 418         int i;
 419
 420         elm = _cache_mntcache_hash(mp);
 421         best = elm;
 422         for (i = 0; i < MNTCACHE_SET; ++i) {
 423                 if (elm->mp == NULL) {
 424                         mpr = atomic_swap_ptr((void *)&elm->mp, mp);
 425                         if (__predict_false(mpr != NULL)) {
 426                                 atomic_add_int(&mpr->mnt_refs, -1);
 427                         }
 428                         elm->ticks = ticks;
 429                         return;
 430                 }
 431                 delta1 = ticks - best->ticks;
 432                 delta2 = ticks - elm->ticks;
 433                 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
 434                         best = elm;
 435                 ++elm;
 436         }
 437         mpr = atomic_swap_ptr((void *)&best->mp, mp);
 438         best->ticks = ticks;
 439         if (mpr)
 440                 atomic_add_int(&mpr->mnt_refs, -1);
 441 }
 442
 443 /*
 444  * Clears all cached mount points on all cpus.  This routine should only
 445  * be called when we are waiting for a mount to clear, e.g. so we can
 446  * unmount.
 447  */
 448 void
 449 cache_clearmntcache(struct mount *target __unused)
 450 {
 451         int n;
 452
 453         for (n = 0; n < ncpus; ++n) {
 454                 struct mntcache *cache = &pcpu_mntcache[n];
 455                 struct mntcache_elm *elm;
 456                 struct namecache *ncp;
 457                 struct mount *mp;
 458                 int i;
 459
 460                 for (i = 0; i < MNTCACHE_COUNT; ++i) {
 461                         elm = &cache->array[i];
 462                         if (elm->mp) {
 463                                 mp = atomic_swap_ptr((void *)&elm->mp, NULL);
 464                                 if (mp)
 465                                         atomic_add_int(&mp->mnt_refs, -1);
 466                         }
 467                         if (elm->ncp) {
 468                                 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
 469                                 if (ncp)
 470                                         _cache_drop(ncp);
 471                         }
 472                 }
 473         }
 474 }
 475
 476 /*
 477  * Namespace locking.  The caller must already hold a reference to the
 478  * namecache structure in order to lock/unlock it.  The controlling entity
 479  * in a 1->0 transition does not need to lock the ncp to dispose of it,
 480  * as nobody else will have visibility to it at that point.
 481  *
 482  * Note that holding a locked namecache structure prevents other threads
 483  * from making namespace changes (e.g. deleting or creating), prevents
 484  * vnode association state changes by other threads, and prevents the
 485  * namecache entry from being resolved or unresolved by other threads.
 486  *
 487  * An exclusive lock owner has full authority to associate/disassociate
 488  * vnodes and resolve/unresolve the locked ncp.
 489  *
 490  * A shared lock owner only has authority to acquire the underlying vnode,
 491  * if any.
 492  *
 493  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 494  * fact (when locking) or cleared prior to unlocking.
 495  *
 496  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 497  *           or recycled, but it does NOT help you if the vnode had already
 498  *           initiated a recyclement.  If this is important, use cache_get()
 499  *           rather then cache_lock() (and deal with the differences in the
 500  *           way the refs counter is handled).  Or, alternatively, make an
 501  *           unconditional call to cache_validate() or cache_resolve()
 502  *           after cache_lock() returns.
 503  */
 504 static __inline
 505 void
 506 _cache_lock(struct namecache *ncp)
 507 {
 508         int didwarn = 0;
 509         int error;
 510
 511         error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
 512         while (__predict_false(error == EWOULDBLOCK)) {
 513                 if (didwarn == 0) {
 514                         didwarn = ticks - nclockwarn;
 515                         kprintf("[diagnostic] cache_lock: "
 516                                 "%s blocked on %p "
 517                                 "\"%*.*s\"\n",
 518                                 curthread->td_comm, ncp,
 519                                 ncp->nc_nlen, ncp->nc_nlen,
 520                                 ncp->nc_name);
 521                 }
 522                 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
 523         }
 524         if (__predict_false(didwarn)) {
 525                 kprintf("[diagnostic] cache_lock: "
 526                         "%s unblocked %*.*s after %d secs\n",
 527                         curthread->td_comm,
 528                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 529                         (int)(ticks - didwarn) / hz);
 530         }
 531 }
 532
 533 /*
 534  * Release a previously acquired lock.
 535  *
 536  * A concurrent shared-lock acquisition or acquisition/release can
 537  * race bit 31 so only drop the ncp if bit 31 was set.
 538  */
 539 static __inline
 540 void
 541 _cache_unlock(struct namecache *ncp)
 542 {
 543         lockmgr(&ncp->nc_lock, LK_RELEASE);
 544 }
 545
 546 /*
 547  * Lock ncp exclusively, non-blocking.  Return 0 on success.
 548  */
 549 static __inline
 550 int
 551 _cache_lock_nonblock(struct namecache *ncp)
 552 {
 553         int error;
 554
 555         error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
 556         if (__predict_false(error != 0)) {
 557                 return(EWOULDBLOCK);
 558         }
 559         return 0;
 560 }
 561
 562 /*
 563  * This is a special form of _cache_lock() which only succeeds if
 564  * it can get a pristine, non-recursive lock.  The caller must have
 565  * already ref'd the ncp.
 566  *
 567  * On success the ncp will be locked, on failure it will not.  The
 568  * ref count does not change either way.
 569  *
 570  * We want _cache_lock_special() (on success) to return a definitively
 571  * usable vnode or a definitively unresolved ncp.
 572  */
 573 static __inline
 574 int
 575 _cache_lock_special(struct namecache *ncp)
 576 {
 577         if (_cache_lock_nonblock(ncp) == 0) {
 578                 if (lockmgr_oneexcl(&ncp->nc_lock)) {
 579                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 580                                 _cache_setunresolved(ncp, 1);
 581                         return 0;
 582                 }
 583                 _cache_unlock(ncp);
 584         }
 585         return EWOULDBLOCK;
 586 }
 587
 588 /*
 589  * Shared lock, guarantees vp held
 590  *
 591  * The shared lock holds vp on the 0->1 transition.  It is possible to race
 592  * another shared lock release, preventing the other release from dropping
 593  * the vnode and clearing bit 31.
 594  *
 595  * If it is not set then we are responsible for setting it, and this
 596  * responsibility does not race with anyone else.
 597  */
 598 static __inline
 599 void
 600 _cache_lock_shared(struct namecache *ncp)
 601 {
 602         int didwarn = 0;
 603         int error;
 604
 605         error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
 606         while (__predict_false(error == EWOULDBLOCK)) {
 607                 if (didwarn == 0) {
 608                         didwarn = ticks - nclockwarn;
 609                         kprintf("[diagnostic] cache_lock_shared: "
 610                                 "%s blocked on %p "
 611                                 "\"%*.*s\"\n",
 612                                 curthread->td_comm, ncp,
 613                                 ncp->nc_nlen, ncp->nc_nlen,
 614                                 ncp->nc_name);
 615                 }
 616                 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
 617         }
 618         if (__predict_false(didwarn)) {
 619                 kprintf("[diagnostic] cache_lock_shared: "
 620                         "%s unblocked %*.*s after %d secs\n",
 621                         curthread->td_comm,
 622                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 623                         (int)(ticks - didwarn) / hz);
 624         }
 625 }
 626
 627 /*
 628  * Shared lock, guarantees vp held.  Non-blocking.  Returns 0 on success
 629  */
 630 static __inline
 631 int
 632 _cache_lock_shared_nonblock(struct namecache *ncp)
 633 {
 634         int error;
 635
 636         error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
 637         if (__predict_false(error != 0)) {
 638                 return(EWOULDBLOCK);
 639         }
 640         return 0;
 641 }
 642
 643 /*
 644  * This function tries to get a shared lock but will back-off to an
 645  * exclusive lock if:
 646  *
 647  * (1) Some other thread is trying to obtain an exclusive lock
 648  *     (to prevent the exclusive requester from getting livelocked out
 649  *     by many shared locks).
 650  *
 651  * (2) The current thread already owns an exclusive lock (to avoid
 652  *     deadlocking).
 653  *
 654  * WARNING! On machines with lots of cores we really want to try hard to
 655  *          get a shared lock or concurrent path lookups can chain-react
 656  *          into a very high-latency exclusive lock.
 657  *
 658  *          This is very evident in dsynth's initial scans.
 659  */
 660 static __inline
 661 int
 662 _cache_lock_shared_special(struct namecache *ncp)
 663 {
 664         /*
 665          * Only honor a successful shared lock (returning 0) if there is
 666          * no exclusive request pending and the vnode, if present, is not
 667          * in a reclaimed state.
 668          */
 669         if (_cache_lock_shared_nonblock(ncp) == 0) {
 670                 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
 671                         if (ncp->nc_vp == NULL ||
 672                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
 673                                 return(0);
 674                         }
 675                 }
 676                 _cache_unlock(ncp);
 677                 return(EWOULDBLOCK);
 678         }
 679
 680         /*
 681          * Non-blocking shared lock failed.  If we already own the exclusive
 682          * lock just acquire another exclusive lock (instead of deadlocking).
 683          * Otherwise acquire a shared lock.
 684          */
 685         if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
 686                 _cache_lock(ncp);
 687                 return(0);
 688         }
 689         _cache_lock_shared(ncp);
 690         return(0);
 691 }
 692
 693 /*
 694  * Returns:
 695  *      -1      Locked by other
 696  *       0      Not locked
 697  *      (v)     LK_SHARED or LK_EXCLUSIVE
 698  */
 699 static __inline
 700 int
 701 _cache_lockstatus(struct namecache *ncp)
 702 {
 703         int status;
 704
 705         status = lockstatus(&ncp->nc_lock, curthread);
 706         if (status == LK_EXCLOTHER)
 707                 status = -1;
 708         return status;
 709 }
 710
 711 /*
 712  * cache_hold() and cache_drop() prevent the premature deletion of a
 713  * namecache entry but do not prevent operations (such as zapping) on
 714  * that namecache entry.
 715  *
 716  * This routine may only be called from outside this source module if
 717  * nc_refs is already deterministically at least 1, such as being
 718  * associated with e.g. a process, file descriptor, or some other entity.
 719  *
 720  * Only the above situations, similar situations within this module where
 721  * the ref count is deterministically at least 1, or when the ncp is found
 722  * via the nchpp (hash table) lookup, can bump nc_refs.
 723  *
 724  * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 725  * can still be removed from the nc_list, however, as long as the caller
 726  * can acquire its lock (in the wrong order).
 727  *
 728  * This is a rare case where callers are allowed to hold a spinlock,
 729  * so we can't ourselves.
 730  */
 731 static __inline
 732 struct namecache *
 733 _cache_hold(struct namecache *ncp)
 734 {
 735         KKASSERT(ncp->nc_refs > 0);
 736         atomic_add_int(&ncp->nc_refs, 1);
 737
 738         return(ncp);
 739 }
 740
 741 /*
 742  * Drop a cache entry.
 743  *
 744  * The 1->0 transition can only occur after or because the natural ref
 745  * is being dropped.  If another thread had a temporary ref during the
 746  * ncp's destruction, then that other thread might wind up being the
 747  * one to drop the last ref.
 748  */
 749 static __inline
 750 void
 751 _cache_drop(struct namecache *ncp)
 752 {
 753         if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
 754                 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
 755
 756                 /*
 757                  * Scrap it.
 758                  */
 759                 ncp->nc_refs = -1;      /* safety */
 760                 if (ncp->nc_name)
 761                         kfree(ncp->nc_name, M_VFSCACHEAUX);
 762                 kfree_obj(ncp, M_VFSCACHE);
 763         }
 764 }
 765
 766 /*
 767  * Link a new namecache entry to its parent and to the hash table.  Be
 768  * careful to avoid races if vhold() blocks in the future.
 769  *
 770  * Both ncp and par must be referenced and locked.  The reference is
 771  * transfered to the nchpp (and, most notably, NOT to the parent list).
 772  *
 773  * NOTE: The hash table spinlock is held across this call, we can't do
 774  *       anything fancy.
 775  */
 776 static void
 777 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 778                    struct nchash_head *nchpp)
 779 {
 780         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
 781
 782         KKASSERT(ncp->nc_parent == NULL);
 783         _cache_ncp_gen_enter(ncp);
 784         ncp->nc_parent = par;
 785         ncp->nc_head = nchpp;
 786
 787         /*
 788          * Set inheritance flags.  Note that the parent flags may be
 789          * stale due to getattr potentially not having been run yet
 790          * (it gets run during nlookup()'s).
 791          */
 792         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 793         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
 794                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 795         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
 796                 ncp->nc_flag |= NCF_UF_PCACHE;
 797
 798         /*
 799          * Add to hash table and parent, adjust accounting
 800          */
 801         TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 802         atomic_add_long(&pn->vfscache_count, 1);
 803
 804         /*
 805          * ncp is a new leaf being added to the tree
 806          */
 807         if (TAILQ_EMPTY(&ncp->nc_list)) {
 808                 atomic_add_long(&pn->vfscache_leafs, 1);
 809                 if (ncp->nc_flag & NCF_UNRESOLVED)
 810                         atomic_add_long(&pn->vfscache_unres, 1);
 811         }
 812
 813         if (TAILQ_EMPTY(&par->nc_list)) {
 814                 /*
 815                  * Parent was, but now is no longer a leaf
 816                  */
 817                 /*
 818                  * XXX for now don't mess with par's gen, it causes
 819                  * unnecessary nlookup retries (though not many)
 820                  */
 821                 /*_cache_ncp_gen_enter(par);*/
 822                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 823                 if (par->nc_parent) {
 824                         if (par->nc_flag & NCF_UNRESOLVED)
 825                                 atomic_add_long(&pn->vfscache_unres, -1);
 826                         atomic_add_long(&pn->vfscache_leafs, -1);
 827                 }
 828
 829                 /*
 830                  * Any vp associated with an ncp which has children must
 831                  * be held to prevent it from being recycled.
 832                  */
 833                 if (par->nc_vp)
 834                         vhold(par->nc_vp);
 835                 /*_cache_ncp_gen_exit(par);*/
 836         } else {
 837                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 838         }
 839         _cache_hold(par);       /* add nc_parent ref */
 840         _cache_ncp_gen_exit(ncp);
 841 }
 842
 843 /*
 844  * Remove the parent and hash associations from a namecache structure.
 845  * Drop the ref-count on the parent.  The caller receives the ref
 846  * from the ncp's nchpp linkage that was removed and may forward that
 847  * ref to a new linkage.
 848
 849  * The caller usually holds an additional ref * on the ncp so the unlink
 850  * cannot be the final drop.  XXX should not be necessary now since the
 851  * caller receives the ref from the nchpp linkage, assuming the ncp
 852  * was linked in the first place.
 853  *
 854  * ncp must be locked, which means that there won't be any nc_parent
 855  * removal races.  This routine will acquire a temporary lock on
 856  * the parent as well as the appropriate hash chain.
 857  *
 858  * par must be locked and will remain locked on return.
 859  *
 860  * nhcpp must be spin-locked.  This routine eats the spin-lock.
 861  */
 862 static __inline void
 863 _cache_unlink_parent(struct namecache *par, struct namecache *ncp,
 864                      struct nchash_head *nchpp)
 865 {
 866         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
 867         struct vnode *dropvp;
 868
 869         KKASSERT(ncp->nc_parent == par);
 870         cpu_ccfence();
 871         _cache_ncp_gen_enter(ncp);
 872
 873         /* don't add a ref, we drop the nchpp ref later */
 874
 875         /*
 876          * Remove from hash table and parent, adjust accounting
 877          */
 878         TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
 879         TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
 880         atomic_add_long(&pn->vfscache_count, -1);
 881
 882         /*
 883          * Removing leaf from tree
 884          */
 885         if (TAILQ_EMPTY(&ncp->nc_list)) {
 886                 if (ncp->nc_flag & NCF_UNRESOLVED)
 887                         atomic_add_long(&pn->vfscache_unres, -1);
 888                 atomic_add_long(&pn->vfscache_leafs, -1);
 889         }
 890
 891         /*
 892          * Parent is now a leaf?
 893          */
 894         dropvp = NULL;
 895         if (TAILQ_EMPTY(&par->nc_list)) {
 896                 /*
 897                  * XXX for now don't mess with par's gen, it causes
 898                  * unnecessary nlookup retries (though not many)
 899                  */
 900                 /*_cache_ncp_gen_enter(par);*/
 901                 if (par->nc_parent) {
 902                         if (par->nc_flag & NCF_UNRESOLVED)
 903                                 atomic_add_long(&pn->vfscache_unres, 1);
 904                         atomic_add_long(&pn->vfscache_leafs, 1);
 905                 }
 906                 if (par->nc_vp)
 907                         dropvp = par->nc_vp;
 908                 /*_cache_ncp_gen_exit(par);*/
 909         }
 910         ncp->nc_parent = NULL;
 911         ncp->nc_head = NULL;
 912         spin_unlock(&nchpp->spin);
 913         _cache_drop(par);       /* drop ncp's nc_parent ref from (par) */
 914
 915         /*
 916          * We can only safely vdrop with no spinlocks held.
 917          */
 918         if (dropvp)
 919                 vdrop(dropvp);
 920         _cache_ncp_gen_exit(ncp);
 921 }
 922
 923 /*
 924  * Allocate a new namecache structure.  Most of the code does not require
 925  * zero-termination of the string but it makes vop_compat_ncreate() easier.
 926  *
 927  * The returned ncp will be locked and referenced.  The ref is generally meant
 928  * to be transfered to the nchpp linkage.
 929  */
 930 static struct namecache *
 931 cache_alloc(int nlen)
 932 {
 933         struct namecache *ncp;
 934
 935         ncp = kmalloc_obj(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
 936         if (nlen)
 937                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHEAUX, M_WAITOK);
 938         ncp->nc_nlen = nlen;
 939         ncp->nc_flag = NCF_UNRESOLVED;
 940         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
 941         ncp->nc_refs = 1;               /* natural ref */
 942         ncp->nc_generation = 0;         /* link/unlink/res/unres op */
 943         TAILQ_INIT(&ncp->nc_list);
 944         lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
 945         lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
 946
 947         return(ncp);
 948 }
 949
 950 /*
 951  * Can only be called for the case where the ncp has never been
 952  * associated with anything (so no spinlocks are needed).
 953  */
 954 static void
 955 _cache_free(struct namecache *ncp)
 956 {
 957         KKASSERT(ncp->nc_refs == 1);
 958         if (ncp->nc_name)
 959                 kfree(ncp->nc_name, M_VFSCACHEAUX);
 960         kfree_obj(ncp, M_VFSCACHE);
 961 }
 962
 963 /*
 964  * [re]initialize a nchandle.
 965  */
 966 void
 967 cache_zero(struct nchandle *nch)
 968 {
 969         nch->ncp = NULL;
 970         nch->mount = NULL;
 971 }
 972
 973 /*
 974  * Ref and deref a nchandle structure (ncp + mp)
 975  *
 976  * The caller must specify a stable ncp pointer, typically meaning the
 977  * ncp is already referenced but this can also occur indirectly through
 978  * e.g. holding a lock on a direct child.
 979  *
 980  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 981  *          use read spinlocks here.
 982  */
 983 struct nchandle *
 984 cache_hold(struct nchandle *nch)
 985 {
 986         _cache_hold(nch->ncp);
 987         _cache_mntref(nch->mount);
 988         return(nch);
 989 }
 990
 991 /*
 992  * Create a copy of a namecache handle for an already-referenced
 993  * entry.
 994  */
 995 void
 996 cache_copy(struct nchandle *nch, struct nchandle *target)
 997 {
 998         struct namecache *ncp;
 999         struct mount *mp;
1000         struct mntcache_elm *elm;
1001         struct namecache *ncpr;
1002         int i;
1003
1004         ncp = nch->ncp;
1005         mp = nch->mount;
1006         target->ncp = ncp;
1007         target->mount = mp;
1008
1009         elm = _cache_mntcache_hash(ncp);
1010         for (i = 0; i < MNTCACHE_SET; ++i) {
1011                 if (elm->ncp == ncp) {
1012                         ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
1013                         if (ncpr == ncp) {
1014                                 _cache_mntref(mp);
1015                                 return;
1016                         }
1017                         if (ncpr)
1018                                 _cache_drop(ncpr);
1019                 }
1020                 ++elm;
1021         }
1022         if (ncp)
1023                 _cache_hold(ncp);
1024         _cache_mntref(mp);
1025 }
1026
1027 /*
1028  * Drop the nchandle, but try to cache the ref to avoid global atomic
1029  * ops.  This is typically done on the system root and jail root nchandles.
1030  */
1031 void
1032 cache_drop_and_cache(struct nchandle *nch, int elmno)
1033 {
1034         struct mntcache_elm *elm;
1035         struct mntcache_elm *best;
1036         struct namecache *ncpr;
1037         int delta1;
1038         int delta2;
1039         int i;
1040
1041         if (elmno > 4) {
1042                 if (nch->ncp) {
1043                         _cache_drop(nch->ncp);
1044                         nch->ncp = NULL;
1045                 }
1046                 if (nch->mount) {
1047                         _cache_mntrel(nch->mount);
1048                         nch->mount = NULL;
1049                 }
1050                 return;
1051         }
1052
1053         elm = _cache_mntcache_hash(nch->ncp);
1054         best = elm;
1055         for (i = 0; i < MNTCACHE_SET; ++i) {
1056                 if (elm->ncp == NULL) {
1057                         ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
1058                         _cache_mntrel(nch->mount);
1059                         elm->ticks = ticks;
1060                         nch->mount = NULL;
1061                         nch->ncp = NULL;
1062                         if (ncpr)
1063                                 _cache_drop(ncpr);
1064                         return;
1065                 }
1066                 delta1 = ticks - best->ticks;
1067                 delta2 = ticks - elm->ticks;
1068                 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
1069                         best = elm;
1070                 ++elm;
1071         }
1072         ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
1073         _cache_mntrel(nch->mount);
1074         best->ticks = ticks;
1075         nch->mount = NULL;
1076         nch->ncp = NULL;
1077         if (ncpr)
1078                 _cache_drop(ncpr);
1079 }
1080
1081 void
1082 cache_changemount(struct nchandle *nch, struct mount *mp)
1083 {
1084         _cache_mntref(mp);
1085         _cache_mntrel(nch->mount);
1086         nch->mount = mp;
1087 }
1088
1089 void
1090 cache_drop(struct nchandle *nch)
1091 {
1092         _cache_mntrel(nch->mount);
1093         _cache_drop(nch->ncp);
1094         nch->ncp = NULL;
1095         nch->mount = NULL;
1096 }
1097
1098 /*
1099  * Returns:
1100  *      -1      Locked by other
1101  *       0      Not locked
1102  *      (v)     LK_SHARED or LK_EXCLUSIVE
1103  */
1104 int
1105 cache_lockstatus(struct nchandle *nch)
1106 {
1107         return(_cache_lockstatus(nch->ncp));
1108 }
1109
1110 void
1111 cache_lock(struct nchandle *nch)
1112 {
1113         _cache_lock(nch->ncp);
1114 }
1115
1116 /*
1117  * Returns a shared or exclusive-locked ncp.  The ncp will only be
1118  * shared-locked if it is already resolved.
1119  */
1120 void
1121 cache_lock_maybe_shared(struct nchandle *nch, int excl)
1122 {
1123         struct namecache *ncp = nch->ncp;
1124
1125         if (ncp_shared_lock_disable || excl ||
1126             (ncp->nc_flag & NCF_UNRESOLVED)) {
1127                 _cache_lock(ncp);
1128         } else {
1129                 _cache_lock_shared(ncp);
1130                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1131                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1132                                 _cache_unlock(ncp);
1133                                 _cache_lock(ncp);
1134                         }
1135                 } else {
1136                         _cache_unlock(ncp);
1137                         _cache_lock(ncp);
1138                 }
1139         }
1140 }
1141
1142 /*
1143  * Lock fncpd, fncp, tncpd, and tncp.  tncp is already locked but may
1144  * have to be cycled to avoid deadlocks.  Make sure all four are resolved.
1145  *
1146  * The caller is responsible for checking the validity upon return as
1147  * the records may have been flagged DESTROYED in the interim.
1148  *
1149  * Namecache lock ordering is leaf first, then parent.  However, complex
1150  * interactions may occur between the source and target because there is
1151  * no ordering guarantee between (fncpd, fncp) and (tncpd and tncp).
1152  */
1153 void
1154 cache_lock4_tondlocked(struct nchandle *fncpd, struct nchandle *fncp,
1155                        struct nchandle *tncpd, struct nchandle *tncp,
1156                        struct ucred *fcred, struct ucred *tcred)
1157 {
1158         int tlocked = 1;
1159         u_int dummy_gen = 0;
1160
1161         /*
1162          * Lock tncp and tncpd
1163          *
1164          * NOTE: Because these ncps are not locked to begin with, it is
1165          *       possible for other rename races to cause the normal lock
1166          *       order assumptions to fail.
1167          *
1168          * NOTE: Lock ordering assumptions are valid if a leaf's parent
1169          *       matches after the leaf has been locked.  However, ordering
1170          *       between the 'from' and the 'to' is not and an overlapping
1171          *       lock order reversal is still possible.
1172          */
1173 again:
1174         if (__predict_false(tlocked == 0)) {
1175                 cache_lock(tncp);
1176         }
1177         if (__predict_false(cache_lock_nonblock(tncpd) != 0)) {
1178                 cache_unlock(tncp);
1179                 cache_lock(tncpd);      /* cycle tncpd lock */
1180                 cache_unlock(tncpd);
1181                 tlocked = 0;
1182                 goto again;
1183         }
1184
1185         /*
1186          * Lock fncp and fncpd
1187          *
1188          * NOTE: Because these ncps are not locked to begin with, it is
1189          *       possible for other rename races to cause the normal lock
1190          *       order assumptions to fail.
1191          *
1192          * NOTE: Lock ordering assumptions are valid if a leaf's parent
1193          *       matches after the leaf has been locked.  However, ordering
1194          *       between the 'from' and the 'to' is not and an overlapping
1195          *       lock order reversal is still possible.
1196          */
1197         if (__predict_false(cache_lock_nonblock(fncp) != 0)) {
1198                 cache_unlock(tncpd);
1199                 cache_unlock(tncp);
1200                 cache_lock(fncp);       /* cycle fncp lock */
1201                 cache_unlock(fncp);
1202                 tlocked = 0;
1203                 goto again;
1204         }
1205
1206         if (__predict_false(cache_lock_nonblock(fncpd) != 0)) {
1207                 cache_unlock(fncp);
1208                 cache_unlock(tncpd);
1209                 cache_unlock(tncp);
1210                 cache_lock(fncpd);
1211                 cache_unlock(fncpd);    /* cycle fncpd lock */
1212                 tlocked = 0;
1213                 goto again;
1214         }
1215
1216         if (__predict_true((fncpd->ncp->nc_flag & NCF_DESTROYED) == 0))
1217                 cache_resolve(fncpd, &dummy_gen, fcred);
1218         if (__predict_true((tncpd->ncp->nc_flag & NCF_DESTROYED) == 0))
1219                 cache_resolve(tncpd, &dummy_gen, tcred);
1220         if (__predict_true((fncp->ncp->nc_flag & NCF_DESTROYED) == 0))
1221                 cache_resolve(fncp, &dummy_gen, fcred);
1222         if (__predict_true((tncp->ncp->nc_flag & NCF_DESTROYED) == 0))
1223                 cache_resolve(tncp, &dummy_gen, tcred);
1224 }
1225
1226 int
1227 cache_lock_nonblock(struct nchandle *nch)
1228 {
1229         return(_cache_lock_nonblock(nch->ncp));
1230 }
1231
1232 void
1233 cache_unlock(struct nchandle *nch)
1234 {
1235         _cache_unlock(nch->ncp);
1236 }
1237
1238 /*
1239  * ref-and-lock, unlock-and-deref functions.
1240  *
1241  * This function is primarily used by nlookup.  Even though cache_lock
1242  * holds the vnode, it is possible that the vnode may have already
1243  * initiated a recyclement.
1244  *
1245  * We want cache_get() to return a definitively usable vnode or a
1246  * definitively unresolved ncp.
1247  */
1248 static
1249 struct namecache *
1250 _cache_get(struct namecache *ncp)
1251 {
1252         _cache_hold(ncp);
1253         _cache_lock(ncp);
1254         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1255                 _cache_setunresolved(ncp, 1);
1256         return(ncp);
1257 }
1258
1259 /*
1260  * Attempt to obtain a shared lock on the ncp.  A shared lock will only
1261  * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1262  * valid.  Otherwise an exclusive lock will be acquired instead.
1263  */
1264 static
1265 struct namecache *
1266 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1267 {
1268         if (ncp_shared_lock_disable || excl ||
1269             (ncp->nc_flag & NCF_UNRESOLVED))
1270         {
1271                 return(_cache_get(ncp));
1272         }
1273         _cache_hold(ncp);
1274         _cache_lock_shared(ncp);
1275         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1276                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1277                         _cache_unlock(ncp);
1278                         ncp = _cache_get(ncp);
1279                         _cache_drop(ncp);
1280                 }
1281         } else {
1282                 _cache_unlock(ncp);
1283                 ncp = _cache_get(ncp);
1284                 _cache_drop(ncp);
1285         }
1286         return(ncp);
1287 }
1288
1289 /*
1290  * NOTE: The same nchandle can be passed for both arguments.
1291  */
1292 void
1293 cache_get(struct nchandle *nch, struct nchandle *target)
1294 {
1295         KKASSERT(nch->ncp->nc_refs > 0);
1296         target->mount = nch->mount;
1297         target->ncp = _cache_get(nch->ncp);
1298         _cache_mntref(target->mount);
1299 }
1300
1301 void
1302 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1303 {
1304         KKASSERT(nch->ncp->nc_refs > 0);
1305         target->mount = nch->mount;
1306         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1307         _cache_mntref(target->mount);
1308 }
1309
1310 /*
1311  * Release a held and locked ncp
1312  */
1313 static __inline
1314 void
1315 _cache_put(struct namecache *ncp)
1316 {
1317         _cache_unlock(ncp);
1318         _cache_drop(ncp);
1319 }
1320
1321 void
1322 cache_put(struct nchandle *nch)
1323 {
1324         _cache_mntrel(nch->mount);
1325         _cache_put(nch->ncp);
1326         nch->ncp = NULL;
1327         nch->mount = NULL;
1328 }
1329
1330 /*
1331  * Resolve an unresolved ncp by associating a vnode with it.  If the
1332  * vnode is NULL, a negative cache entry is created.
1333  *
1334  * The ncp should be locked on entry and will remain locked on return.
1335  */
1336 static
1337 void
1338 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp,
1339              int adjgen)
1340 {
1341         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
1342
1343         KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
1344                  (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
1345                  ncp->nc_vp == NULL);
1346
1347         if (adjgen)
1348                 _cache_ncp_gen_enter(ncp);
1349
1350         if (vp) {
1351                 /*
1352                  * Any vp associated with an ncp which has children must
1353                  * be held.  Any vp associated with a locked ncp must be held.
1354                  */
1355                 if (!TAILQ_EMPTY(&ncp->nc_list))
1356                         vhold(vp);
1357                 spin_lock(&vp->v_spin);
1358                 ncp->nc_vp = vp;
1359                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1360                 ++vp->v_namecache_count;
1361                 _cache_hold(ncp);               /* v_namecache assoc */
1362                 spin_unlock(&vp->v_spin);
1363                 vhold(vp);                      /* nc_vp */
1364
1365                 /*
1366                  * Set auxiliary flags
1367                  */
1368                 switch(vp->v_type) {
1369                 case VDIR:
1370                         ncp->nc_flag |= NCF_ISDIR;
1371                         break;
1372                 case VLNK:
1373                         ncp->nc_flag |= NCF_ISSYMLINK;
1374                         /* XXX cache the contents of the symlink */
1375                         break;
1376                 default:
1377                         break;
1378                 }
1379
1380                 ncp->nc_error = 0;
1381
1382                 /*
1383                  * XXX: this is a hack to work-around the lack of a real pfs vfs
1384                  * implementation
1385                  */
1386                 if (mp) {
1387                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1388                                 vp->v_pfsmp = mp;
1389                 }
1390         } else {
1391                 /*
1392                  * When creating a negative cache hit we set the
1393                  * namecache_gen.  A later resolve will clean out the
1394                  * negative cache hit if the mount point's namecache_gen
1395                  * has changed.  Used by devfs, could also be used by
1396                  * other remote FSs.
1397                  */
1398                 ncp->nc_vp = NULL;
1399                 ncp->nc_negcpu = mycpu->gd_cpuid;
1400                 spin_lock(&pn->neg_spin);
1401                 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
1402                 _cache_hold(ncp);       /* neg_list assoc */
1403                 ++pn->neg_count;
1404                 spin_unlock(&pn->neg_spin);
1405                 atomic_add_long(&pn->vfscache_negs, 1);
1406
1407                 ncp->nc_error = ENOENT;
1408                 if (mp)
1409                         VFS_NCPGEN_SET(mp, ncp);
1410         }
1411
1412         /*
1413          * Previously unresolved leaf is now resolved.
1414          *
1415          * Clear the NCF_UNRESOLVED flag last (see cache_nlookup_nonlocked()).
1416          * We only adjust vfscache_unres for ncp's that are in the tree.
1417          */
1418         if (TAILQ_EMPTY(&ncp->nc_list) && ncp->nc_parent)
1419                 atomic_add_long(&pn->vfscache_unres, -1);
1420         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
1421         if (adjgen)
1422                 _cache_ncp_gen_exit(ncp);
1423 }
1424
1425 void
1426 cache_setvp(struct nchandle *nch, struct vnode *vp)
1427 {
1428         _cache_setvp(nch->mount, nch->ncp, vp, 1);
1429 }
1430
1431 /*
1432  * Used for NFS
1433  */
1434 void
1435 cache_settimeout(struct nchandle *nch, int nticks)
1436 {
1437         struct namecache *ncp = nch->ncp;
1438
1439         if ((ncp->nc_timeout = ticks + nticks) == 0)
1440                 ncp->nc_timeout = 1;
1441 }
1442
1443 /*
1444  * Disassociate the vnode or negative-cache association and mark a
1445  * namecache entry as unresolved again.  Note that the ncp is still
1446  * left in the hash table and still linked to its parent.
1447  *
1448  * The ncp should be locked and refd on entry and will remain locked and refd
1449  * on return.
1450  *
1451  * This routine is normally never called on a directory containing children.
1452  * However, NFS often does just that in its rename() code as a cop-out to
1453  * avoid complex namespace operations.  This disconnects a directory vnode
1454  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1455  * sync.
1456  *
1457  */
1458 static
1459 void
1460 _cache_setunresolved(struct namecache *ncp, int adjgen)
1461 {
1462         struct vnode *vp;
1463
1464         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1465                 struct pcpu_ncache *pn;
1466
1467                 if (adjgen)
1468                         _cache_ncp_gen_enter(ncp);
1469
1470                 /*
1471                  * Is a resolved or destroyed leaf now becoming unresolved?
1472                  * Only adjust vfscache_unres for linked ncp's.
1473                  */
1474                 if (TAILQ_EMPTY(&ncp->nc_list) && ncp->nc_parent) {
1475                         pn = &pcpu_ncache[mycpu->gd_cpuid];
1476                         atomic_add_long(&pn->vfscache_unres, 1);
1477                 }
1478
1479                 ncp->nc_flag |= NCF_UNRESOLVED;
1480                 ncp->nc_timeout = 0;
1481                 ncp->nc_error = ENOTCONN;
1482                 if ((vp = ncp->nc_vp) != NULL) {
1483                         spin_lock(&vp->v_spin);
1484                         ncp->nc_vp = NULL;
1485                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1486                         --vp->v_namecache_count;
1487                         spin_unlock(&vp->v_spin);
1488
1489                         /*
1490                          * Any vp associated with an ncp with children is
1491                          * held by that ncp.  Any vp associated with  ncp
1492                          * is held by that ncp.  These conditions must be
1493                          * undone when the vp is cleared out from the ncp.
1494                          */
1495                         if (!TAILQ_EMPTY(&ncp->nc_list))
1496                                 vdrop(vp);
1497                         vdrop(vp);
1498                 } else {
1499                         pn = &pcpu_ncache[ncp->nc_negcpu];
1500
1501                         atomic_add_long(&pn->vfscache_negs, -1);
1502                         spin_lock(&pn->neg_spin);
1503                         TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
1504                         --pn->neg_count;
1505                         spin_unlock(&pn->neg_spin);
1506                 }
1507                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1508
1509                 if (adjgen)
1510                         _cache_ncp_gen_exit(ncp);
1511                 _cache_drop(ncp);       /* from v_namecache or neg_list */
1512         }
1513 }
1514
1515 /*
1516  * The cache_nresolve() code calls this function to automatically
1517  * set a resolved cache element to unresolved if it has timed out
1518  * or if it is a negative cache hit and the mount point namecache_gen
1519  * has changed.
1520  */
1521 static __inline int
1522 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1523 {
1524         /*
1525          * Try to zap entries that have timed out.  We have
1526          * to be careful here because locked leafs may depend
1527          * on the vnode remaining intact in a parent, so only
1528          * do this under very specific conditions.
1529          */
1530         if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1531             TAILQ_EMPTY(&ncp->nc_list)) {
1532                 return 1;
1533         }
1534
1535         /*
1536          * If a resolved negative cache hit is invalid due to
1537          * the mount's namecache generation being bumped, zap it.
1538          */
1539         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1540                 return 1;
1541         }
1542
1543         /*
1544          * Otherwise we are good
1545          */
1546         return 0;
1547 }
1548
1549 static __inline void
1550 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1551 {
1552         /*
1553          * Already in an unresolved state, nothing to do.
1554          */
1555         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1556                 if (_cache_auto_unresolve_test(mp, ncp))
1557                         _cache_setunresolved(ncp, 1);
1558         }
1559 }
1560
1561 void
1562 cache_setunresolved(struct nchandle *nch)
1563 {
1564         _cache_setunresolved(nch->ncp, 1);
1565 }
1566
1567 /*
1568  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1569  * looking for matches.  This flag tells the lookup code when it must
1570  * check for a mount linkage and also prevents the directories in question
1571  * from being deleted or renamed.
1572  */
1573 static
1574 int
1575 cache_clrmountpt_callback(struct mount *mp, void *data)
1576 {
1577         struct nchandle *nch = data;
1578
1579         if (mp->mnt_ncmounton.ncp == nch->ncp)
1580                 return(1);
1581         if (mp->mnt_ncmountpt.ncp == nch->ncp)
1582                 return(1);
1583         return(0);
1584 }
1585
1586 /*
1587  * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
1588  * with a mount point.
1589  */
1590 void
1591 cache_clrmountpt(struct nchandle *nch)
1592 {
1593         int count;
1594
1595         count = mountlist_scan(cache_clrmountpt_callback, nch,
1596                                MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
1597                                MNTSCAN_NOUNLOCK);
1598         if (count == 0)
1599                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1600 }
1601
1602 /*
1603  * Invalidate portions of the namecache topology given a starting entry.
1604  * The passed ncp is set to an unresolved state and:
1605  *
1606  * The passed ncp must be referenced and locked.  The routine may unlock
1607  * and relock ncp several times, and will recheck the children and loop
1608  * to catch races.  When done the passed ncp will be returned with the
1609  * reference and lock intact.
1610  *
1611  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
1612  *                        that the physical underlying nodes have been
1613  *                        destroyed... as in deleted.  For example, when
1614  *                        a directory is removed.  This will cause record
1615  *                        lookups on the name to no longer be able to find
1616  *                        the record and tells the resolver to return failure
1617  *                        rather then trying to resolve through the parent.
1618  *
1619  *                        The topology itself, including ncp->nc_name,
1620  *                        remains intact.
1621  *
1622  *                        This only applies to the passed ncp, if CINV_CHILDREN
1623  *                        is specified the children are not flagged.
1624  *
1625  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
1626  *                        state as well.
1627  *
1628  *                        Note that this will also have the side effect of
1629  *                        cleaning out any unreferenced nodes in the topology
1630  *                        from the leaves up as the recursion backs out.
1631  *
1632  * Note that the topology for any referenced nodes remains intact, but
1633  * the nodes will be marked as having been destroyed and will be set
1634  * to an unresolved state.
1635  *
1636  * It is possible for cache_inval() to race a cache_resolve(), meaning that
1637  * the namecache entry may not actually be invalidated on return if it was
1638  * revalidated while recursing down into its children.  This code guarentees
1639  * that the node(s) will go through an invalidation cycle, but does not
1640  * guarentee that they will remain in an invalidated state.
1641  *
1642  * Returns non-zero if a revalidation was detected during the invalidation
1643  * recursion, zero otherwise.  Note that since only the original ncp is
1644  * locked the revalidation ultimately can only indicate that the original ncp
1645  * *MIGHT* no have been reresolved.
1646  *
1647  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1648  * have to avoid blowing out the kernel stack.  We do this by saving the
1649  * deep namecache node and aborting the recursion, then re-recursing at that
1650  * node using a depth-first algorithm in order to allow multiple deep
1651  * recursions to chain through each other, then we restart the invalidation
1652  * from scratch.
1653  */
1654
1655 struct cinvtrack {
1656         struct namecache *resume_ncp;
1657         int depth;
1658 };
1659
1660 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1661
1662 static
1663 int
1664 _cache_inval(struct namecache *ncp, int flags)
1665 {
1666         struct cinvtrack track;
1667         struct namecache *ncp2;
1668         int r;
1669
1670         track.depth = 0;
1671         track.resume_ncp = NULL;
1672
1673         for (;;) {
1674                 r = _cache_inval_internal(ncp, flags, &track);
1675                 if (track.resume_ncp == NULL)
1676                         break;
1677                 _cache_unlock(ncp);
1678                 while ((ncp2 = track.resume_ncp) != NULL) {
1679                         track.resume_ncp = NULL;
1680                         _cache_lock(ncp2);
1681                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1682                                              &track);
1683                         /*_cache_put(ncp2);*/
1684                         cache_zap(ncp2);
1685                 }
1686                 _cache_lock(ncp);
1687         }
1688         return(r);
1689 }
1690
1691 int
1692 cache_inval(struct nchandle *nch, int flags)
1693 {
1694         return(_cache_inval(nch->ncp, flags));
1695 }
1696
1697 /*
1698  * Helper for _cache_inval().  The passed ncp is refd and locked and
1699  * remains that way on return, but may be unlocked/relocked multiple
1700  * times by the routine.
1701  */
1702 static int
1703 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1704 {
1705         struct namecache *nextkid;
1706         int rcnt = 0;
1707
1708         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1709
1710         _cache_ncp_gen_enter(ncp);
1711         _cache_setunresolved(ncp, 0);
1712         if (flags & CINV_DESTROY) {
1713                 ncp->nc_flag |= NCF_DESTROYED;
1714                 cpu_sfence();
1715         }
1716
1717         while ((flags & CINV_CHILDREN) &&
1718                (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1719         ) {
1720                 struct namecache *kid;
1721                 int restart;
1722
1723                 restart = 0;
1724                 _cache_hold(nextkid);
1725                 if (++track->depth > MAX_RECURSION_DEPTH) {
1726                         track->resume_ncp = ncp;
1727                         _cache_hold(ncp);
1728                         ++rcnt;
1729                 }
1730                 while ((kid = nextkid) != NULL) {
1731                         /*
1732                          * Parent (ncp) must be locked for the iteration.
1733                          */
1734                         nextkid = NULL;
1735                         if (kid->nc_parent != ncp) {
1736                                 _cache_drop(kid);
1737                                 kprintf("cache_inval_internal restartA %s\n",
1738                                         ncp->nc_name);
1739                                 restart = 1;
1740                                 break;
1741                         }
1742                         if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1743                                 _cache_hold(nextkid);
1744
1745                         /*
1746                          * Parent unlocked for this section to avoid
1747                          * deadlocks.  Then lock the kid and check for
1748                          * races.
1749                          */
1750                         _cache_unlock(ncp);
1751                         if (track->resume_ncp) {
1752                                 _cache_drop(kid);
1753                                 _cache_lock(ncp);
1754                                 break;
1755                         }
1756                         _cache_lock(kid);
1757                         if (kid->nc_parent != ncp) {
1758                                 kprintf("cache_inval_internal "
1759                                         "restartB %s\n",
1760                                         ncp->nc_name);
1761                                 restart = 1;
1762                                 _cache_unlock(kid);
1763                                 _cache_drop(kid);
1764                                 _cache_lock(ncp);
1765                                 break;
1766                         }
1767                         if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1768                             TAILQ_FIRST(&kid->nc_list)
1769                         ) {
1770
1771                                 rcnt += _cache_inval_internal(kid,
1772                                                 flags & ~CINV_DESTROY, track);
1773                                 /*_cache_unlock(kid);*/
1774                                 /*_cache_drop(kid);*/
1775                                 cache_zap(kid);
1776                         } else {
1777                                 cache_zap(kid);
1778                         }
1779
1780                         /*
1781                          * Relock parent to continue scan
1782                          */
1783                         _cache_lock(ncp);
1784                 }
1785                 if (nextkid)
1786                         _cache_drop(nextkid);
1787                 --track->depth;
1788                 if (restart == 0)
1789                         break;
1790         }
1791
1792         /*
1793          * Someone could have gotten in there while ncp was unlocked,
1794          * retry if so.
1795          */
1796         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1797                 ++rcnt;
1798         _cache_ncp_gen_exit(ncp);
1799
1800         return (rcnt);
1801 }
1802
1803 /*
1804  * Invalidate a vnode's namecache associations.  To avoid races against
1805  * the resolver we do not invalidate a node which we previously invalidated
1806  * but which was then re-resolved while we were in the invalidation loop.
1807  *
1808  * Returns non-zero if any namecache entries remain after the invalidation
1809  * loop completed.
1810  *
1811  * NOTE: Unlike the namecache topology which guarentees that ncp's will not
1812  *       be ripped out of the topology while held, the vnode's v_namecache
1813  *       list has no such restriction.  NCP's can be ripped out of the list
1814  *       at virtually any time if not locked, even if held.
1815  *
1816  *       In addition, the v_namecache list itself must be locked via
1817  *       the vnode's spinlock.
1818  */
1819 int
1820 cache_inval_vp(struct vnode *vp, int flags)
1821 {
1822         struct namecache *ncp;
1823         struct namecache *next;
1824
1825 restart:
1826         spin_lock(&vp->v_spin);
1827         ncp = TAILQ_FIRST(&vp->v_namecache);
1828         if (ncp)
1829                 _cache_hold(ncp);
1830         while (ncp) {
1831                 /* loop entered with ncp held and vp spin-locked */
1832                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1833                         _cache_hold(next);
1834                 spin_unlock(&vp->v_spin);
1835                 _cache_lock(ncp);
1836                 if (ncp->nc_vp != vp) {
1837                         kprintf("Warning: cache_inval_vp: race-A detected on "
1838                                 "%s\n", ncp->nc_name);
1839                         _cache_put(ncp);
1840                         if (next)
1841                                 _cache_drop(next);
1842                         goto restart;
1843                 }
1844                 _cache_inval(ncp, flags);
1845                 _cache_put(ncp);                /* also releases reference */
1846                 ncp = next;
1847                 spin_lock(&vp->v_spin);
1848                 if (ncp && ncp->nc_vp != vp) {
1849                         spin_unlock(&vp->v_spin);
1850                         kprintf("Warning: cache_inval_vp: race-B detected on "
1851                                 "%s\n", ncp->nc_name);
1852                         _cache_drop(ncp);
1853                         goto restart;
1854                 }
1855         }
1856         spin_unlock(&vp->v_spin);
1857         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1858 }
1859
1860 /*
1861  * This routine is used instead of the normal cache_inval_vp() when we
1862  * are trying to recycle otherwise good vnodes.
1863  *
1864  * Return 0 on success, non-zero if not all namecache records could be
1865  * disassociated from the vnode (for various reasons).
1866  */
1867 int
1868 cache_inval_vp_nonblock(struct vnode *vp)
1869 {
1870         struct namecache *ncp;
1871         struct namecache *next;
1872
1873         spin_lock(&vp->v_spin);
1874
1875         ncp = TAILQ_FIRST(&vp->v_namecache);
1876         if (ncp)
1877                 _cache_hold(ncp);
1878
1879         while (ncp) {
1880                 /* loop entered with ncp held */
1881                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1882                         _cache_hold(next);
1883                 spin_unlock(&vp->v_spin);
1884                 if (_cache_lock_nonblock(ncp)) {
1885                         _cache_drop(ncp);
1886                         if (next)
1887                                 _cache_drop(next);
1888                         goto done;
1889                 }
1890                 if (ncp->nc_vp != vp) {
1891                         kprintf("Warning: cache_inval_vp: race-A detected on "
1892                                 "%s\n", ncp->nc_name);
1893                         _cache_put(ncp);
1894                         if (next)
1895                                 _cache_drop(next);
1896                         goto done;
1897                 }
1898                 _cache_inval(ncp, 0);
1899                 _cache_put(ncp);                /* also releases reference */
1900                 ncp = next;
1901                 spin_lock(&vp->v_spin);
1902                 if (ncp && ncp->nc_vp != vp) {
1903                         spin_unlock(&vp->v_spin);
1904                         kprintf("Warning: cache_inval_vp: race-B detected on "
1905                                 "%s\n", ncp->nc_name);
1906                         _cache_drop(ncp);
1907                         goto done;
1908                 }
1909         }
1910         spin_unlock(&vp->v_spin);
1911 done:
1912         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1913 }
1914
1915 /*
1916  * Attempt to quickly invalidate the vnode's namecache entry.  This function
1917  * will also dive the ncp and free its children but only if they are trivial.
1918  * All locks are non-blocking and the function will fail if required locks
1919  * cannot be obtained.
1920  *
1921  * We want this sort of function to be able to guarantee progress when vnlru
1922  * wants to recycle a vnode.  Directories could otherwise get stuck and not
1923  * be able to recycle due to destroyed or unresolved children in the
1924  * namecache.
1925  */
1926 void
1927 cache_inval_vp_quick(struct vnode *vp)
1928 {
1929         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
1930         struct namecache *ncp;
1931         struct namecache *kid;
1932
1933         spin_lock(&vp->v_spin);
1934         while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
1935                 _cache_hold(ncp);
1936                 spin_unlock(&vp->v_spin);
1937                 if (_cache_lock_nonblock(ncp)) {
1938                         _cache_drop(ncp);
1939                         return;
1940                 }
1941
1942                 /*
1943                  * Try to trivially destroy any children.
1944                  */
1945                 while ((kid = TAILQ_FIRST(&ncp->nc_list)) != NULL) {
1946                         struct nchash_head *nchpp;
1947
1948                         /*
1949                          * Early test without the lock.  Give-up if the
1950                          * child has children of its own, the child is
1951                          * positively-resolved, or the ref-count is
1952                          * unexpected.
1953                          */
1954                         if (TAILQ_FIRST(&kid->nc_list) ||
1955                             kid->nc_vp ||
1956                             kid->nc_refs != ncpbaserefs(kid))
1957                         {
1958                                 _cache_put(ncp);
1959                                 return;
1960                         }
1961
1962                         _cache_hold(kid);
1963                         if (_cache_lock_nonblock(kid)) {
1964                                 _cache_drop(kid);
1965                                 _cache_put(ncp);
1966                                 return;
1967                         }
1968
1969                         /*
1970                          * A destruction/free test requires the parent,
1971                          * the kid, and the hash table to be locked.  Note
1972                          * that the kid may still be on the negative cache
1973                          * list.
1974                          */
1975                         nchpp = kid->nc_head;
1976                         spin_lock(&nchpp->spin);
1977
1978                         /*
1979                          * Give up if the child isn't trivial.  It can be
1980                          * resolved or unresolved but must not have a vp.
1981                          */
1982                         if (kid->nc_parent != ncp ||
1983                             kid->nc_vp ||
1984                             TAILQ_FIRST(&kid->nc_list) ||
1985                             kid->nc_refs != 1 + ncpbaserefs(kid))
1986                         {
1987                                 spin_unlock(&nchpp->spin);
1988                                 _cache_put(kid);
1989                                 _cache_put(ncp);
1990                                 return;
1991                         }
1992
1993                         ++pn->inv_kid_quick_count;
1994
1995                         /*
1996                          * We can safely destroy the kid.  It may still
1997                          * have extra refs due to ncneglist races, but since
1998                          * we checked above with the lock held those races
1999                          * will self-resolve.
2000                          *
2001                          * With these actions the kid should nominally
2002                          * have just its natural ref plus our ref.
2003                          *
2004                          * This is only safe because we hold locks on
2005                          * the parent, the kid, and the nchpp.  The only
2006                          * lock we don't have is on the ncneglist and that
2007                          * can race a ref, but as long as we unresolve the
2008                          * kid before executing our final drop the ncneglist
2009                          * code path(s) will just drop their own ref so all
2010                          * is good.
2011                          */
2012                         _cache_unlink_parent(ncp, kid, nchpp);
2013                         _cache_setunresolved(kid, 1);
2014                         if (kid->nc_refs != 2) {
2015                                 kprintf("Warning: kid %p unexpected refs=%d "
2016                                         "%08x %s\n",
2017                                         kid, kid->nc_refs,
2018                                         kid->nc_flag, kid->nc_name);
2019                         }
2020                         _cache_put(kid);    /* drop our ref and lock */
2021                         _cache_drop(kid);   /* drop natural ref to destroy */
2022                 }
2023
2024                 /*
2025                  * Now check ncp itself against our expectations.  With
2026                  * no children left we have our ref plus whether it is
2027                  * resolved or not (which it has to be, actually, since it
2028                  * is hanging off the vp->v_namecache).
2029                  */
2030                 if (ncp->nc_refs != 1 + ncpbaserefs(ncp)) {
2031                         _cache_put(ncp);
2032                         spin_lock(&vp->v_spin);
2033                         break;
2034                 }
2035
2036                 ++pn->inv_ncp_quick_count;
2037
2038                 /*
2039                  * Success, disassociate and release the ncp.  Do not
2040                  * try to zap it here.
2041                  *
2042                  * NOTE: Releasing the ncp here leaves it in the tree,
2043                  *       but since we have disassociated the vnode this
2044                  *       ncp entry becomes 'trivial' and successive calls
2045                  *       to cache_inval_vp_quick() will be able to continue
2046                  *       to make progress.
2047                  */
2048                 _cache_setunresolved(ncp, 1);
2049                 _cache_put(ncp);
2050                 spin_lock(&vp->v_spin);
2051         }
2052         spin_unlock(&vp->v_spin);
2053 }
2054
2055 /*
2056  * Clears the universal directory search 'ok' flag.  This flag allows
2057  * nlookup() to bypass normal vnode checks.  This flag is a cached flag
2058  * so clearing it simply forces revalidation.
2059  */
2060 void
2061 cache_inval_wxok(struct vnode *vp)
2062 {
2063         struct namecache *ncp;
2064
2065         spin_lock(&vp->v_spin);
2066         TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
2067                 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX))
2068                         atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX);
2069         }
2070         spin_unlock(&vp->v_spin);
2071 }
2072
2073 /*
2074  * The source ncp has been renamed to the target ncp.  All elements have been
2075  * locked, including the parent ncp's.
2076  *
2077  * The target ncp is destroyed (as a normal rename-over would destroy the
2078  * target file or directory).
2079  *
2080  * Because there may be references to the source ncp we cannot copy its
2081  * contents to the target.  Instead the source ncp is relinked as the target
2082  * and the target ncp is removed from the namecache topology.
2083  */
2084 void
2085 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
2086 {
2087         struct namecache *fncp = fnch->ncp;
2088         struct namecache *tncp = tnch->ncp;
2089         struct namecache *par;
2090         struct nchash_head *nchpp;
2091         u_int32_t hash;
2092         char *oname;
2093         char *nname;
2094
2095         if (tncp->nc_nlen) {
2096                 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHEAUX, M_WAITOK);
2097                 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
2098                 nname[tncp->nc_nlen] = 0;
2099         } else {
2100                 nname = NULL;
2101         }
2102
2103         /*
2104          * Rename fncp (unlink)
2105          */
2106         if (fncp->nc_parent) {
2107                 par = fncp->nc_parent;
2108                 _cache_hold(par);
2109                 _cache_lock(par);
2110                 nchpp = fncp->nc_head;
2111                 spin_lock(&nchpp->spin);
2112                 _cache_unlink_parent(par, fncp, nchpp); /* eats nchpp */
2113                 _cache_put(par);
2114         } else {
2115                 par = NULL;
2116                 nchpp = NULL;
2117         }
2118         oname = fncp->nc_name;
2119         fncp->nc_name = nname;
2120         fncp->nc_nlen = tncp->nc_nlen;
2121         if (oname)
2122                 kfree(oname, M_VFSCACHEAUX);
2123
2124         par = tncp->nc_parent;
2125         KKASSERT(par->nc_lock.lk_lockholder == curthread);
2126
2127         /*
2128          * Rename fncp (relink)
2129          */
2130         hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
2131         hash = fnv_32_buf(&par, sizeof(par), hash);
2132         nchpp = NCHHASH(hash);
2133
2134         spin_lock(&nchpp->spin);
2135         _cache_link_parent(fncp, par, nchpp);
2136         spin_unlock(&nchpp->spin);
2137
2138         /*
2139          * Get rid of the overwritten tncp (unlink)
2140          */
2141         _cache_unlink(tncp);
2142 }
2143
2144 /*
2145  * Perform actions consistent with unlinking a file.  The passed-in ncp
2146  * must be locked.
2147  *
2148  * The ncp is marked DESTROYED so it no longer shows up in searches,
2149  * and will be physically deleted when the vnode goes away.
2150  *
2151  * If the related vnode has no refs then we cycle it through vget()/vput()
2152  * to (possibly if we don't have a ref race) trigger a deactivation,
2153  * allowing the VFS to trivially detect and recycle the deleted vnode
2154  * via VOP_INACTIVE().
2155  *
2156  * NOTE: _cache_rename() will automatically call _cache_unlink() on the
2157  *       target ncp.
2158  */
2159 void
2160 cache_unlink(struct nchandle *nch)
2161 {
2162         _cache_unlink(nch->ncp);
2163 }
2164
2165 static void
2166 _cache_unlink(struct namecache *ncp)
2167 {
2168         struct vnode *vp;
2169
2170         /*
2171          * Causes lookups to fail and allows another ncp with the same
2172          * name to be created under ncp->nc_parent.
2173          */
2174         _cache_ncp_gen_enter(ncp);
2175         ncp->nc_flag |= NCF_DESTROYED;
2176
2177         /*
2178          * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
2179          * force action on the 1->0 transition.  Do not destroy the
2180          * vp association if a vp is present (leave the destroyed ncp
2181          * resolved through the vp finalization).
2182          *
2183          * Cleanup the refs in the resolved-not-found case by setting
2184          * the ncp to an unresolved state.  This improves our ability
2185          * to get rid of dead ncp elements in other cache_*() routines.
2186          */
2187         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
2188                 vp = ncp->nc_vp;
2189                 if (vp) {
2190                         atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
2191                         if (VREFCNT(vp) <= 0) {
2192                                 if (vget(vp, LK_SHARED) == 0)
2193                                         vput(vp);
2194                         }
2195                 } else {
2196                         _cache_setunresolved(ncp, 0);
2197                 }
2198         }
2199         _cache_ncp_gen_exit(ncp);
2200 }
2201
2202 /*
2203  * Return non-zero if the nch might be associated with an open and/or mmap()'d
2204  * file.  The easy solution is to just return non-zero if the vnode has refs.
2205  * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
2206  * force the reclaim).
2207  */
2208 int
2209 cache_isopen(struct nchandle *nch)
2210 {
2211         struct vnode *vp;
2212         struct namecache *ncp = nch->ncp;
2213
2214         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
2215             (vp = ncp->nc_vp) != NULL &&
2216             VREFCNT(vp)) {
2217                 return 1;
2218         }
2219         return 0;
2220 }
2221
2222
2223 /*
2224  * vget the vnode associated with the namecache entry.  Resolve the namecache
2225  * entry if necessary.  The passed ncp must be referenced and locked.  If
2226  * the ncp is resolved it might be locked shared.
2227  *
2228  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
2229  * (depending on the passed lk_type) will be returned in *vpp with an error
2230  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
2231  * most typical error is ENOENT, meaning that the ncp represents a negative
2232  * cache hit and there is no vnode to retrieve, but other errors can occur
2233  * too.
2234  *
2235  * The vget() can race a reclaim.  If this occurs we re-resolve the
2236  * namecache entry.
2237  *
2238  * There are numerous places in the kernel where vget() is called on a
2239  * vnode while one or more of its namecache entries is locked.  Releasing
2240  * a vnode never deadlocks against locked namecache entries (the vnode
2241  * will not get recycled while referenced ncp's exist).  This means we
2242  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
2243  * lock when acquiring the vp lock or we might cause a deadlock.
2244  *
2245  * NOTE: The passed-in ncp must be locked exclusively if it is initially
2246  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
2247  *       relocked exclusively before being re-resolved.
2248  */
2249 int
2250 cache_vget(struct nchandle *nch, struct ucred *cred,
2251            int lk_type, struct vnode **vpp)
2252 {
2253         struct namecache *ncp;
2254         struct vnode *vp;
2255         int error;
2256         u_int dummy_gen = 0;
2257
2258         ncp = nch->ncp;
2259 again:
2260         vp = NULL;
2261         if (ncp->nc_flag & NCF_UNRESOLVED)
2262                 error = cache_resolve(nch, &dummy_gen, cred);
2263         else
2264                 error = 0;
2265
2266         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
2267                 error = vget(vp, lk_type);
2268                 if (error) {
2269                         /*
2270                          * VRECLAIM race
2271                          *
2272                          * The ncp may have been locked shared, we must relock
2273                          * it exclusively before we can set it to unresolved.
2274                          */
2275                         if (error == ENOENT) {
2276                                 kprintf("Warning: vnode reclaim race detected "
2277                                         "in cache_vget on %p (%s)\n",
2278                                         vp, ncp->nc_name);
2279                                 _cache_unlock(ncp);
2280                                 _cache_lock(ncp);
2281                                 _cache_setunresolved(ncp, 1);
2282                                 goto again;
2283                         }
2284
2285                         /*
2286                          * Not a reclaim race, some other error.
2287                          */
2288                         KKASSERT(ncp->nc_vp == vp);
2289                         vp = NULL;
2290                 } else {
2291                         KKASSERT(ncp->nc_vp == vp);
2292                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2293                 }
2294         }
2295         if (error == 0 && vp == NULL)
2296                 error = ENOENT;
2297         *vpp = vp;
2298         return(error);
2299 }
2300
2301 /*
2302  * Similar to cache_vget() but only acquires a ref on the vnode.  The vnode
2303  * is already held by virtuue of the ncp being locked, but it might not be
2304  * referenced and while it is not referenced it can transition into the
2305  * VRECLAIMED state.
2306  *
2307  * NOTE: The passed-in ncp must be locked exclusively if it is initially
2308  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
2309  *       relocked exclusively before being re-resolved.
2310  *
2311  * NOTE: At the moment we have to issue a vget() on the vnode, even though
2312  *       we are going to immediately release the lock, in order to resolve
2313  *       potential reclamation races.  Once we have a solid vnode ref that
2314  *       was (at some point) interlocked via a vget(), the vnode will not
2315  *       be reclaimed.
2316  *
2317  * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
2318  */
2319 int
2320 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
2321 {
2322         struct namecache *ncp;
2323         struct vnode *vp;
2324         int error;
2325         int v;
2326         u_int dummy_gen = 0;
2327
2328         ncp = nch->ncp;
2329 again:
2330         vp = NULL;
2331         if (ncp->nc_flag & NCF_UNRESOLVED)
2332                 error = cache_resolve(nch, &dummy_gen, cred);
2333         else
2334                 error = 0;
2335
2336         while (error == 0 && (vp = ncp->nc_vp) != NULL) {
2337                 /*
2338                  * Try a lockless ref of the vnode.  VRECLAIMED transitions
2339                  * use the vx_lock state and update-counter mechanism so we
2340                  * can detect if one is in-progress or occurred.
2341                  *
2342                  * If we can successfully ref the vnode and interlock against
2343                  * the update-counter mechanism, and VRECLAIMED is found to
2344                  * not be set after that, we should be good.
2345                  */
2346                 v = spin_access_start_only(&vp->v_spin);
2347                 if (__predict_true(spin_access_check_inprog(v) == 0)) {
2348                         vref_special(vp);
2349                         if (__predict_false(
2350                                     spin_access_end_only(&vp->v_spin, v))) {
2351                                 vrele(vp);
2352                                 continue;
2353                         }
2354                         if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) {
2355                                 break;
2356                         }
2357                         vrele(vp);
2358                         kprintf("CACHE_VREF: IN-RECLAIM\n");
2359                 }
2360
2361                 /*
2362                  * Do it the slow way
2363                  */
2364                 error = vget(vp, LK_SHARED);
2365                 if (error) {
2366                         /*
2367                          * VRECLAIM race
2368                          */
2369                         if (error == ENOENT) {
2370                                 kprintf("Warning: vnode reclaim race detected "
2371                                         "in cache_vget on %p (%s)\n",
2372                                         vp, ncp->nc_name);
2373                                 _cache_unlock(ncp);
2374                                 _cache_lock(ncp);
2375                                 _cache_setunresolved(ncp, 1);
2376                                 goto again;
2377                         }
2378
2379                         /*
2380                          * Not a reclaim race, some other error.
2381                          */
2382                         KKASSERT(ncp->nc_vp == vp);
2383                         vp = NULL;
2384                 } else {
2385                         KKASSERT(ncp->nc_vp == vp);
2386                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2387                         /* caller does not want a lock */
2388                         vn_unlock(vp);
2389                 }
2390                 break;
2391         }
2392         if (error == 0 && vp == NULL)
2393                 error = ENOENT;
2394         *vpp = vp;
2395
2396         return(error);
2397 }
2398
2399 /*
2400  * Return a referenced vnode representing the parent directory of
2401  * ncp.
2402  *
2403  * Because the caller has locked the ncp it should not be possible for
2404  * the parent ncp to go away.  However, the parent can unresolve its
2405  * dvp at any time so we must be able to acquire a lock on the parent
2406  * to safely access nc_vp.
2407  *
2408  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
2409  * so use vhold()/vdrop() while holding the lock to prevent dvp from
2410  * getting destroyed.
2411  *
2412  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
2413  *       lock on the ncp in question..
2414  */
2415 struct vnode *
2416 cache_dvpref(struct namecache *ncp)
2417 {
2418         struct namecache *par;
2419         struct vnode *dvp;
2420
2421         dvp = NULL;
2422         if ((par = ncp->nc_parent) != NULL) {
2423                 _cache_hold(par);
2424                 _cache_lock(par);
2425                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
2426                         if ((dvp = par->nc_vp) != NULL)
2427                                 vhold(dvp);
2428                 }
2429                 _cache_unlock(par);
2430                 if (dvp) {
2431                         if (vget(dvp, LK_SHARED) == 0) {
2432                                 vn_unlock(dvp);
2433                                 vdrop(dvp);
2434                                 /* return refd, unlocked dvp */
2435                         } else {
2436                                 vdrop(dvp);
2437                                 dvp = NULL;
2438                         }
2439                 }
2440                 _cache_drop(par);
2441         }
2442         return(dvp);
2443 }
2444
2445 /*
2446  * Convert a directory vnode to a namecache record without any other
2447  * knowledge of the topology.  This ONLY works with directory vnodes and
2448  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
2449  * returned ncp (if not NULL) will be held and unlocked.
2450  *
2451  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2452  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2453  * for dvp.  This will fail only if the directory has been deleted out from
2454  * under the caller.
2455  *
2456  * Callers must always check for a NULL return no matter the value of 'makeit'.
2457  *
2458  * To avoid underflowing the kernel stack each recursive call increments
2459  * the makeit variable.
2460  */
2461
2462 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2463                                   struct vnode *dvp, char *fakename);
2464 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2465                                   struct vnode **saved_dvp);
2466
2467 int
2468 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
2469               struct nchandle *nch)
2470 {
2471         struct vnode *saved_dvp;
2472         struct vnode *pvp;
2473         char *fakename;
2474         int error;
2475
2476         nch->ncp = NULL;
2477         nch->mount = dvp->v_mount;
2478         saved_dvp = NULL;
2479         fakename = NULL;
2480
2481         /*
2482          * Handle the makeit == 0 degenerate case
2483          */
2484         if (makeit == 0) {
2485                 spin_lock_shared(&dvp->v_spin);
2486                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2487                 if (nch->ncp)
2488                         cache_hold(nch);
2489                 spin_unlock_shared(&dvp->v_spin);
2490         }
2491
2492         /*
2493          * Loop until resolution, inside code will break out on error.
2494          */
2495         while (makeit) {
2496                 /*
2497                  * Break out if we successfully acquire a working ncp.
2498                  */
2499                 spin_lock_shared(&dvp->v_spin);
2500                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2501                 if (nch->ncp) {
2502                         cache_hold(nch);
2503                         spin_unlock_shared(&dvp->v_spin);
2504                         break;
2505                 }
2506                 spin_unlock_shared(&dvp->v_spin);
2507
2508                 /*
2509                  * If dvp is the root of its filesystem it should already
2510                  * have a namecache pointer associated with it as a side
2511                  * effect of the mount, but it may have been disassociated.
2512                  */
2513                 if (dvp->v_flag & VROOT) {
2514                         nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
2515                         error = cache_resolve_mp(nch->mount, 1);
2516                         _cache_put(nch->ncp);
2517                         if (ncvp_debug & 1) {
2518                                 kprintf("cache_fromdvp: resolve root of "
2519                                         "mount %p error %d",
2520                                         dvp->v_mount, error);
2521                         }
2522                         if (error) {
2523                                 if (ncvp_debug & 1)
2524                                         kprintf(" failed\n");
2525                                 nch->ncp = NULL;
2526                                 break;
2527                         }
2528                         if (ncvp_debug & 1)
2529                                 kprintf(" succeeded\n");
2530                         continue;
2531                 }
2532
2533                 /*
2534                  * If we are recursed too deeply resort to an O(n^2)
2535                  * algorithm to resolve the namecache topology.  The
2536                  * resolved pvp is left referenced in saved_dvp to
2537                  * prevent the tree from being destroyed while we loop.
2538                  */
2539                 if (makeit > 20) {
2540                         error = cache_fromdvp_try(dvp, cred, &saved_dvp);
2541                         if (error) {
2542                                 kprintf("lookupdotdot(longpath) failed %d "
2543                                        "dvp %p\n", error, dvp);
2544                                 nch->ncp = NULL;
2545                                 break;
2546                         }
2547                         continue;
2548                 }
2549
2550                 /*
2551                  * Get the parent directory and resolve its ncp.
2552                  */
2553                 if (fakename) {
2554                         kfree(fakename, M_TEMP);
2555                         fakename = NULL;
2556                 }
2557                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2558                                           &fakename);
2559                 if (error) {
2560                         kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
2561                         break;
2562                 }
2563                 vn_unlock(pvp);
2564
2565                 /*
2566                  * Reuse makeit as a recursion depth counter.  On success
2567                  * nch will be fully referenced.
2568                  */
2569                 cache_fromdvp(pvp, cred, makeit + 1, nch);
2570                 vrele(pvp);
2571                 if (nch->ncp == NULL)
2572                         break;
2573
2574                 /*
2575                  * Do an inefficient scan of pvp (embodied by ncp) to look
2576                  * for dvp.  This will create a namecache record for dvp on
2577                  * success.  We loop up to recheck on success.
2578                  *
2579                  * ncp and dvp are both held but not locked.
2580                  */
2581                 error = cache_inefficient_scan(nch, cred, dvp, fakename);
2582                 if (error) {
2583                         kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
2584                                 pvp, nch->ncp->nc_name, dvp);
2585                         cache_drop(nch);
2586                         /* nch was NULLed out, reload mount */
2587                         nch->mount = dvp->v_mount;
2588                         break;
2589                 }
2590                 if (ncvp_debug & 1) {
2591                         kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
2592                                 pvp, nch->ncp->nc_name);
2593                 }
2594                 cache_drop(nch);
2595                 /* nch was NULLed out, reload mount */
2596                 nch->mount = dvp->v_mount;
2597         }
2598
2599         /*
2600          * If nch->ncp is non-NULL it will have been held already.
2601          */
2602         if (fakename)
2603                 kfree(fakename, M_TEMP);
2604         if (saved_dvp)
2605                 vrele(saved_dvp);
2606         if (nch->ncp)
2607                 return (0);
2608         return (EINVAL);
2609 }
2610
2611 /*
2612  * Go up the chain of parent directories until we find something
2613  * we can resolve into the namecache.  This is very inefficient.
2614  */
2615 static
2616 int
2617 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2618                   struct vnode **saved_dvp)
2619 {
2620         struct nchandle nch;
2621         struct vnode *pvp;
2622         int error;
2623         static time_t last_fromdvp_report;
2624         char *fakename;
2625
2626         /*
2627          * Loop getting the parent directory vnode until we get something we
2628          * can resolve in the namecache.
2629          */
2630         vref(dvp);
2631         nch.mount = dvp->v_mount;
2632         nch.ncp = NULL;
2633         fakename = NULL;
2634
2635         for (;;) {
2636                 if (fakename) {
2637                         kfree(fakename, M_TEMP);
2638                         fakename = NULL;
2639                 }
2640                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2641                                           &fakename);
2642                 if (error) {
2643                         vrele(dvp);
2644                         break;
2645                 }
2646                 vn_unlock(pvp);
2647                 spin_lock_shared(&pvp->v_spin);
2648                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2649                         _cache_hold(nch.ncp);
2650                         spin_unlock_shared(&pvp->v_spin);
2651                         vrele(pvp);
2652                         break;
2653                 }
2654                 spin_unlock_shared(&pvp->v_spin);
2655                 if (pvp->v_flag & VROOT) {
2656                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2657                         error = cache_resolve_mp(nch.mount, 1);
2658                         _cache_unlock(nch.ncp);
2659                         vrele(pvp);
2660                         if (error) {
2661                                 _cache_drop(nch.ncp);
2662                                 nch.ncp = NULL;
2663                                 vrele(dvp);
2664                         }
2665                         break;
2666                 }
2667                 vrele(dvp);
2668                 dvp = pvp;
2669         }
2670         if (error == 0) {
2671                 if (last_fromdvp_report != time_uptime) {
2672                         last_fromdvp_report = time_uptime;
2673                         kprintf("Warning: extremely inefficient path "
2674                                 "resolution on %s\n",
2675                                 nch.ncp->nc_name);
2676                 }
2677                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2678
2679                 /*
2680                  * Hopefully dvp now has a namecache record associated with
2681                  * it.  Leave it referenced to prevent the kernel from
2682                  * recycling the vnode.  Otherwise extremely long directory
2683                  * paths could result in endless recycling.
2684                  */
2685                 if (*saved_dvp)
2686                     vrele(*saved_dvp);
2687                 *saved_dvp = dvp;
2688                 _cache_drop(nch.ncp);
2689         }
2690         if (fakename)
2691                 kfree(fakename, M_TEMP);
2692         return (error);
2693 }
2694
2695 /*
2696  * Do an inefficient scan of the directory represented by ncp looking for
2697  * the directory vnode dvp.  ncp must be held but not locked on entry and
2698  * will be held on return.  dvp must be refd but not locked on entry and
2699  * will remain refd on return.
2700  *
2701  * Why do this at all?  Well, due to its stateless nature the NFS server
2702  * converts file handles directly to vnodes without necessarily going through
2703  * the namecache ops that would otherwise create the namecache topology
2704  * leading to the vnode.  We could either (1) Change the namecache algorithms
2705  * to allow disconnect namecache records that are re-merged opportunistically,
2706  * or (2) Make the NFS server backtrack and scan to recover a connected
2707  * namecache topology in order to then be able to issue new API lookups.
2708  *
2709  * It turns out that (1) is a huge mess.  It takes a nice clean set of
2710  * namecache algorithms and introduces a lot of complication in every subsystem
2711  * that calls into the namecache to deal with the re-merge case, especially
2712  * since we are using the namecache to placehold negative lookups and the
2713  * vnode might not be immediately assigned. (2) is certainly far less
2714  * efficient then (1), but since we are only talking about directories here
2715  * (which are likely to remain cached), the case does not actually run all
2716  * that often and has the supreme advantage of not polluting the namecache
2717  * algorithms.
2718  *
2719  * If a fakename is supplied just construct a namecache entry using the
2720  * fake name.
2721  */
2722 static int
2723 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2724                        struct vnode *dvp, char *fakename)
2725 {
2726         struct nlcomponent nlc;
2727         struct nchandle rncp;
2728         struct dirent *den;
2729         struct vnode *pvp;
2730         struct vattr vat;
2731         struct iovec iov;
2732         struct uio uio;
2733         int blksize;
2734         int eofflag;
2735         int bytes;
2736         char *rbuf;
2737         int error;
2738
2739         vat.va_blocksize = 0;
2740         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2741                 return (error);
2742         cache_lock(nch);
2743         error = cache_vref(nch, cred, &pvp);
2744         cache_unlock(nch);
2745         if (error)
2746                 return (error);
2747         if (ncvp_debug & 1) {
2748                 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2749                         "vattr fileid = %lld\n",
2750                         nch->ncp, nch->ncp->nc_name,
2751                         vat.va_blocksize,
2752                         (long long)vat.va_fileid);
2753         }
2754
2755         /*
2756          * Use the supplied fakename if not NULL.  Fake names are typically
2757          * not in the actual filesystem hierarchy.  This is used by HAMMER
2758          * to glue @@timestamp recursions together.
2759          */
2760         if (fakename) {
2761                 nlc.nlc_nameptr = fakename;
2762                 nlc.nlc_namelen = strlen(fakename);
2763                 rncp = cache_nlookup(nch, &nlc);
2764                 goto done;
2765         }
2766
2767         if ((blksize = vat.va_blocksize) == 0)
2768                 blksize = DEV_BSIZE;
2769         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2770         rncp.ncp = NULL;
2771
2772         eofflag = 0;
2773         uio.uio_offset = 0;
2774 again:
2775         iov.iov_base = rbuf;
2776         iov.iov_len = blksize;
2777         uio.uio_iov = &iov;
2778         uio.uio_iovcnt = 1;
2779         uio.uio_resid = blksize;
2780         uio.uio_segflg = UIO_SYSSPACE;
2781         uio.uio_rw = UIO_READ;
2782         uio.uio_td = curthread;
2783
2784         if (ncvp_debug & 2)
2785                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2786         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2787         if (error == 0) {
2788                 den = (struct dirent *)rbuf;
2789                 bytes = blksize - uio.uio_resid;
2790
2791                 while (bytes > 0) {
2792                         if (ncvp_debug & 2) {
2793                                 kprintf("cache_inefficient_scan: %*.*s\n",
2794                                         den->d_namlen, den->d_namlen,
2795                                         den->d_name);
2796                         }
2797                         if (den->d_type != DT_WHT &&
2798                             den->d_ino == vat.va_fileid) {
2799                                 if (ncvp_debug & 1) {
2800                                         kprintf("cache_inefficient_scan: "
2801                                                "MATCHED inode %lld path %s/%*.*s\n",
2802                                                (long long)vat.va_fileid,
2803                                                nch->ncp->nc_name,
2804                                                den->d_namlen, den->d_namlen,
2805                                                den->d_name);
2806                                 }
2807                                 nlc.nlc_nameptr = den->d_name;
2808                                 nlc.nlc_namelen = den->d_namlen;
2809                                 rncp = cache_nlookup(nch, &nlc);
2810                                 KKASSERT(rncp.ncp != NULL);
2811                                 break;
2812                         }
2813                         bytes -= _DIRENT_DIRSIZ(den);
2814                         den = _DIRENT_NEXT(den);
2815                 }
2816                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2817                         goto again;
2818         }
2819         kfree(rbuf, M_TEMP);
2820 done:
2821         vrele(pvp);
2822         if (rncp.ncp) {
2823                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2824                         _cache_setvp(rncp.mount, rncp.ncp, dvp, 1);
2825                         if (ncvp_debug & 2) {
2826                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2827                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2828                         }
2829                 } else {
2830                         if (ncvp_debug & 2) {
2831                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2832                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2833                                         rncp.ncp->nc_vp);
2834                         }
2835                 }
2836                 if (rncp.ncp->nc_vp == NULL)
2837                         error = rncp.ncp->nc_error;
2838                 /*
2839                  * Release rncp after a successful nlookup.  rncp was fully
2840                  * referenced.
2841                  */
2842                 cache_put(&rncp);
2843         } else {
2844                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2845                         dvp, nch->ncp->nc_name);
2846                 error = ENOENT;
2847         }
2848         return (error);
2849 }
2850
2851 /*
2852  * This function must be called with the ncp held and locked and will unlock
2853  * and drop it during zapping.
2854  *
2855  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2856  * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list
2857  * and removes the related reference.  If the ncp can be removed, and the
2858  * parent can be zapped non-blocking, this function loops up.
2859  *
2860  * There will be one ref from the caller (which we now own).  The only
2861  * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list,
2862  * so possibly 2 refs left.  Taking this into account, if there are no
2863  * additional refs and no children, the ncp will be removed from the topology
2864  * and destroyed.
2865  *
2866  * References and/or children may exist if the ncp is in the middle of the
2867  * topology, preventing the ncp from being destroyed.
2868  *
2869  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2870  *
2871  * This function may return a held (but NOT locked) parent node which the
2872  * caller must drop in a loop.  Looping is one way to avoid unbounded recursion
2873  * due to deep namecache trees.
2874  *
2875  * WARNING!  For MPSAFE operation this routine must acquire up to three
2876  *           spin locks to be able to safely test nc_refs.  Lock order is
2877  *           very important.
2878  *
2879  *           hash spinlock if on hash list
2880  *           parent spinlock if child of parent
2881  *           (the ncp is unresolved so there is no vnode association)
2882  */
2883 static int
2884 cache_zap(struct namecache *ncp)
2885 {
2886         struct namecache *par;
2887         struct nchash_head *nchpp;
2888         int refcmp;
2889         int nonblock = 1;       /* XXX cleanup */
2890         int res = 0;
2891
2892 again:
2893         /*
2894          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2895          * This gets rid of any vp->v_namecache list or negative list and
2896          * the related ref.
2897          */
2898         _cache_setunresolved(ncp, 1);
2899
2900         /*
2901          * Try to scrap the entry and possibly tail-recurse on its parent.
2902          * We only scrap unref'd (other then our ref) unresolved entries,
2903          * we do not scrap 'live' entries.
2904          *
2905          * If nc_parent is non NULL we expect 2 references, else just 1.
2906          * If there are more, someone else also holds the ncp and we cannot
2907          * destroy it.
2908          */
2909         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2910         KKASSERT(ncp->nc_refs > 0);
2911
2912         /*
2913          * If the ncp is linked to its parent it will also be in the hash
2914          * table.  We have to be able to lock the parent and the hash table.
2915          *
2916          * Acquire locks.  Note that the parent can't go away while we hold
2917          * a child locked.  If nc_parent is present, expect 2 refs instead
2918          * of 1.
2919          */
2920         nchpp = NULL;
2921         if ((par = ncp->nc_parent) != NULL) {
2922                 if (nonblock) {
2923                         if (_cache_lock_nonblock(par)) {
2924                                 /* lock failed */
2925                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2926                                 atomic_add_long(
2927                                     &pcpu_ncache[mycpu->gd_cpuid].numdefered,
2928                                     1);
2929                                 _cache_unlock(ncp);
2930                                 _cache_drop(ncp);       /* caller's ref */
2931                                 return res;
2932                         }
2933                         _cache_hold(par);
2934                 } else {
2935                         _cache_hold(par);
2936                         _cache_lock(par);
2937                 }
2938                 nchpp = ncp->nc_head;
2939                 spin_lock(&nchpp->spin);
2940         }
2941
2942         /*
2943          * With the parent and nchpp locked, and the vnode removed
2944          * (no vp->v_namecache), we expect 1 or 2 refs.  If there are
2945          * more someone else has a ref and we cannot zap the entry.
2946          *
2947          * one for our hold
2948          * one for our parent link (parent also has one from the linkage)
2949          */
2950         if (par)
2951                 refcmp = 2;
2952         else
2953                 refcmp = 1;
2954
2955         /*
2956          * On failure undo the work we've done so far and drop the
2957          * caller's ref and ncp.
2958          */
2959         if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) {
2960                 if (par) {
2961                         spin_unlock(&nchpp->spin);
2962                         _cache_put(par);
2963                 }
2964                 _cache_unlock(ncp);
2965                 _cache_drop(ncp);
2966                 return res;
2967         }
2968
2969         /*
2970          * We own all the refs and with the spinlocks held no further
2971          * refs can be acquired by others.
2972          *
2973          * Remove us from the hash list and parent list.  We have to
2974          * drop a ref on the parent's vp if the parent's list becomes
2975          * empty.
2976          */
2977         if (par) {
2978                 KKASSERT(nchpp == ncp->nc_head);
2979                 _cache_unlink_parent(par, ncp, nchpp); /* eats nhcpp */
2980                 /*_cache_unlock(par);*/
2981                 /* &nchpp->spin is unlocked by call */
2982         } else {
2983                 KKASSERT(ncp->nc_head == NULL);
2984         }
2985
2986         /*
2987          * ncp should not have picked up any refs.  Physically
2988          * destroy the ncp.
2989          */
2990         if (ncp->nc_refs != refcmp) {
2991                 panic("cache_zap: %p bad refs %d (expected %d)\n",
2992                         ncp, ncp->nc_refs, refcmp);
2993         }
2994         /* _cache_unlock(ncp) not required */
2995         ncp->nc_refs = -1;      /* safety */
2996         if (ncp->nc_name)
2997                 kfree(ncp->nc_name, M_VFSCACHEAUX);
2998         kfree_obj(ncp, M_VFSCACHE);
2999         res = 1;
3000
3001         /*
3002          * Loop up if we can recursively clean out the parent.
3003          */
3004         if (par) {
3005                 refcmp = 1;             /* ref on parent */
3006                 if (par->nc_parent)     /* par->par */
3007                         ++refcmp;
3008                 par->nc_flag &= ~NCF_DEFEREDZAP;
3009                 if ((par->nc_flag & NCF_UNRESOLVED) &&
3010                     par->nc_refs == refcmp &&
3011                     TAILQ_EMPTY(&par->nc_list))
3012                 {
3013                         ncp = par;
3014                         goto again;
3015                 }
3016                 _cache_unlock(par);
3017                 _cache_drop(par);
3018         }
3019         return 1;
3020 }
3021
3022 /*
3023  * Clean up dangling negative cache and defered-drop entries in the
3024  * namecache.
3025  *
3026  * This routine is called in the critical path and also called from
3027  * vnlru().  When called from vnlru we use a lower limit to try to
3028  * deal with the negative cache before the critical path has to start
3029  * dealing with it.
3030  */
3031 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
3032
3033 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
3034 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
3035 static cache_hs_t exc_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
3036
3037 static int cache_hyst_run[2];
3038
3039 void
3040 cache_hysteresis(int critpath)
3041 {
3042         long poslimit;
3043         long exclimit;
3044         long neglimit;
3045         long xnumunres;
3046         long xnumleafs;
3047         long clean_neg;
3048         long clean_unres;
3049         long clean_excess;
3050
3051         /*
3052          * Lets not compete for running a general garbage collection
3053          */
3054         if (atomic_swap_int(&cache_hyst_run[critpath], 1) != 0)
3055                 return;
3056
3057         /*
3058          * Calculate negative ncp limit
3059          */
3060         neglimit = maxvnodes / ncnegfactor;
3061         if (critpath == 0)
3062                 neglimit = neglimit * 8 / 10;
3063
3064         /*
3065          * Don't cache too many negative hits.  We use hysteresis to reduce
3066          * the impact on the critical path.
3067          */
3068         clean_neg = 0;
3069
3070         switch(neg_cache_hysteresis_state[critpath]) {
3071         case CHI_LOW:
3072                 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
3073                         if (critpath)
3074                                 clean_neg = ncnegflush;
3075                         else
3076                                 clean_neg = ncnegflush +
3077                                             vfscache_negs - neglimit;
3078                         neg_cache_hysteresis_state[critpath] = CHI_HIGH;
3079                 }
3080                 break;
3081         case CHI_HIGH:
3082                 if (vfscache_negs > MINNEG * 9 / 10 &&
3083                     vfscache_negs * 9 / 10 > neglimit
3084                 ) {
3085                         if (critpath)
3086                                 clean_neg = ncnegflush;
3087                         else
3088                                 clean_neg = ncnegflush +
3089                                             vfscache_negs * 9 / 10 -
3090                                             neglimit;
3091                 } else {
3092                         neg_cache_hysteresis_state[critpath] = CHI_LOW;
3093                 }
3094                 break;
3095         }
3096         if (clean_neg)
3097                 _cache_cleanneg(clean_neg);
3098
3099         /*
3100          * Don't cache too many unresolved elements.  We use hysteresis to
3101          * reduce the impact on the critical path.
3102          */
3103         if ((poslimit = ncposlimit) == 0)
3104                 poslimit = maxvnodes / ncposfactor;
3105         if (critpath == 0)
3106                 poslimit = poslimit * 8 / 10;
3107
3108         /*
3109          * Number of unresolved leaf elements in the namecache.  These
3110          * can build-up for various reasons and may have to be disposed
3111          * of to allow the inactive list to be cleaned out by vnlru_proc()
3112          *
3113          * Collect count
3114          */
3115         xnumunres = vfscache_unres;
3116         clean_unres = 0;
3117
3118         switch(pos_cache_hysteresis_state[critpath]) {
3119         case CHI_LOW:
3120                 if (xnumunres > poslimit && xnumunres > MINPOS) {
3121                         if (critpath)
3122                                 clean_unres = ncposflush;
3123                         else
3124                                 clean_unres = ncposflush + xnumunres -
3125                                               poslimit;
3126                         pos_cache_hysteresis_state[critpath] = CHI_HIGH;
3127                 }
3128                 break;
3129         case CHI_HIGH:
3130                 if (xnumunres > poslimit * 5 / 6 && xnumunres > MINPOS) {
3131                         if (critpath)
3132                                 clean_unres = ncposflush;
3133                         else
3134                                 clean_unres = ncposflush + xnumunres -
3135                                               poslimit * 5 / 6;
3136                 } else {
3137                         pos_cache_hysteresis_state[critpath] = CHI_LOW;
3138                 }
3139                 break;
3140         }
3141
3142         /*
3143          * Excessive positive hits can accumulate due to large numbers of
3144          * hardlinks (the vnode cache will not prevent ncps representing
3145          * hardlinks from growing into infinity).
3146          */
3147         exclimit = maxvnodes * 2;
3148         if (critpath == 0)
3149                 exclimit = exclimit * 8 / 10;
3150         xnumleafs = vfscache_leafs;
3151         clean_excess = 0;
3152
3153         switch(exc_cache_hysteresis_state[critpath]) {
3154         case CHI_LOW:
3155                 if (xnumleafs > exclimit && xnumleafs > MINPOS) {
3156                         if (critpath)
3157                                 clean_excess = ncposflush;
3158                         else
3159                                 clean_excess = ncposflush + xnumleafs -
3160                                                exclimit;
3161                         exc_cache_hysteresis_state[critpath] = CHI_HIGH;
3162                 }
3163                 break;
3164         case CHI_HIGH:
3165                 if (xnumleafs > exclimit * 5 / 6 && xnumleafs > MINPOS) {
3166                         if (critpath)
3167                                 clean_excess = ncposflush;
3168                         else
3169                                 clean_excess = ncposflush + xnumleafs -
3170                                                exclimit * 5 / 6;
3171                 } else {
3172                         exc_cache_hysteresis_state[critpath] = CHI_LOW;
3173                 }
3174                 break;
3175         }
3176
3177         if (clean_unres || clean_excess)
3178                 _cache_cleanpos(clean_unres, clean_excess);
3179
3180         /*
3181          * Clean out dangling defered-zap ncps which could not be cleanly
3182          * dropped if too many build up.  Note that numdefered is
3183          * heuristical.  Make sure we are real-time for the current cpu,
3184          * plus the global rollup.
3185          */
3186         if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
3187                 _cache_cleandefered();
3188         }
3189
3190         atomic_swap_int(&cache_hyst_run[critpath], 0);
3191 }
3192
3193 /*
3194  * NEW NAMECACHE LOOKUP API
3195  *
3196  * Lookup an entry in the namecache.  The passed par_nch must be referenced
3197  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
3198  * is ALWAYS returned, eve if the supplied component is illegal.
3199  *
3200  * The resulting namecache entry should be returned to the system with
3201  * cache_put() or cache_unlock() + cache_drop().
3202  *
3203  * namecache locks are recursive but care must be taken to avoid lock order
3204  * reversals (hence why the passed par_nch must be unlocked).  Locking
3205  * rules are to order for parent traversals, not for child traversals.
3206  *
3207  * Nobody else will be able to manipulate the associated namespace (e.g.
3208  * create, delete, rename, rename-target) until the caller unlocks the
3209  * entry.
3210  *
3211  * The returned entry will be in one of three states:  positive hit (non-null
3212  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
3213  * Unresolved entries must be resolved through the filesystem to associate the
3214  * vnode and/or determine whether a positive or negative hit has occured.
3215  *
3216  * It is not necessary to lock a directory in order to lock namespace under
3217  * that directory.  In fact, it is explicitly not allowed to do that.  A
3218  * directory is typically only locked when being created, renamed, or
3219  * destroyed.
3220  *
3221  * The directory (par) may be unresolved, in which case any returned child
3222  * will likely also be marked unresolved.  Likely but not guarenteed.  Since
3223  * the filesystem lookup requires a resolved directory vnode the caller is
3224  * responsible for resolving the namecache chain top-down.  This API
3225  * specifically allows whole chains to be created in an unresolved state.
3226  */
3227 struct nchandle
3228 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
3229 {
3230         struct nchandle nch;
3231         struct namecache *ncp;
3232         struct namecache *new_ncp;
3233         struct namecache *rep_ncp;      /* reuse a destroyed ncp */
3234         struct nchash_head *nchpp;
3235         struct mount *mp;
3236         u_int32_t hash;
3237         globaldata_t gd;
3238         int par_locked;
3239         int use_excl;
3240
3241         gd = mycpu;
3242         mp = par_nch->mount;
3243         par_locked = 0;
3244
3245         /*
3246          * This is a good time to call it, no ncp's are locked by
3247          * the caller or us.
3248          */
3249         cache_hysteresis(1);
3250
3251         /*
3252          * Try to locate an existing entry
3253          */
3254         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3255         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3256         new_ncp = NULL;
3257         use_excl = 0;
3258         nchpp = NCHHASH(hash);
3259 restart:
3260         rep_ncp = NULL;
3261         if (use_excl)
3262                 spin_lock(&nchpp->spin);
3263         else
3264                 spin_lock_shared(&nchpp->spin);
3265
3266         /*
3267          * Do a reverse scan to collect any DESTROYED ncps prior to matching
3268          * an existing entry.
3269          */
3270         TAILQ_FOREACH_REVERSE(ncp, &nchpp->list, nchash_list, nc_hash) {
3271                 /*
3272                  * Break out if we find a matching entry.  Note that
3273                  * UNRESOLVED entries may match, but DESTROYED entries
3274                  * do not.
3275                  *
3276                  * We may be able to reuse DESTROYED entries that we come
3277                  * across, even if the name does not match, as long as
3278                  * nc_nlen is correct and the only hold ref is from the nchpp
3279                  * list itself.
3280                  */
3281                 if (ncp->nc_parent == par_nch->ncp &&
3282                     ncp->nc_nlen == nlc->nlc_namelen) {
3283                         if (ncp->nc_flag & NCF_DESTROYED) {
3284                                 if (ncp->nc_refs == 1 && rep_ncp == NULL)
3285                                         rep_ncp = ncp;
3286                                 continue;
3287                         }
3288                         if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen))
3289                                 continue;
3290
3291                         /*
3292                          * Matched ncp
3293                          */
3294                         _cache_hold(ncp);
3295                         if (rep_ncp)
3296                                 _cache_hold(rep_ncp);
3297
3298                         if (use_excl)
3299                                 spin_unlock(&nchpp->spin);
3300                         else
3301                                 spin_unlock_shared(&nchpp->spin);
3302
3303                         if (par_locked) {
3304                                 _cache_unlock(par_nch->ncp);
3305                                 par_locked = 0;
3306                         }
3307
3308                         /*
3309                          * Really try to destroy rep_ncp if encountered.
3310                          * Various edge cases can build up more than one,
3311                          * so loop if we succeed.  This isn't perfect, but
3312                          * we can't afford to have tons of entries build
3313                          * up on a single nhcpp list due to rename-over
3314                          * operations.  If that were to happen, the system
3315                          * would bog down quickly.
3316                          */
3317                         if (rep_ncp) {
3318                                 if (_cache_lock_nonblock(rep_ncp) == 0) {
3319                                         if (rep_ncp->nc_flag & NCF_DESTROYED) {
3320                                                 if (cache_zap(rep_ncp)) {
3321                                                         _cache_drop(ncp);
3322                                                         goto restart;
3323                                                 }
3324                                         } else {
3325                                                 _cache_unlock(rep_ncp);
3326                                                 _cache_drop(rep_ncp);
3327                                         }
3328                                 } else {
3329                                         _cache_drop(rep_ncp);
3330                                 }
3331                         }
3332
3333                         /*
3334                          * Continue processing the matched entry
3335                          */
3336                         if (_cache_lock_special(ncp) == 0) {
3337                                 /*
3338                                  * Successfully locked but we must re-test
3339                                  * conditions that might have changed since
3340                                  * we did not have the lock before.
3341                                  */
3342                                 if (ncp->nc_parent != par_nch->ncp ||
3343                                     ncp->nc_nlen != nlc->nlc_namelen ||
3344                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3345                                          ncp->nc_nlen) ||
3346                                     (ncp->nc_flag & NCF_DESTROYED)) {
3347                                         _cache_put(ncp);
3348                                         goto restart;
3349                                 }
3350                                 _cache_auto_unresolve(mp, ncp);
3351                                 if (new_ncp) {
3352                                         _cache_free(new_ncp);
3353                                         new_ncp = NULL; /* safety */
3354                                 }
3355                                 goto found;
3356                         }
3357                         _cache_get(ncp);        /* cycle the lock to block */
3358                         _cache_put(ncp);
3359                         _cache_drop(ncp);
3360                         goto restart;
3361                 }
3362         }
3363
3364         /*
3365          * We failed to locate the entry, try to resurrect a destroyed
3366          * entry that we did find that is already correctly linked into
3367          * nchpp and the parent.  We must re-test conditions after
3368          * successfully locking rep_ncp.
3369          *
3370          * This case can occur under heavy loads due to not being able
3371          * to safely lock the parent in cache_zap().  Nominally a repeated
3372          * create/unlink load, but only the namelen needs to match.
3373          *
3374          * An exclusive lock on the nchpp is required to process this case,
3375          * otherwise a race can cause duplicate entries to be created with
3376          * one cpu reusing a DESTROYED ncp while another creates a new_ncp.
3377          */
3378         if (rep_ncp && use_excl) {
3379                 if (_cache_lock_nonblock(rep_ncp) == 0) {
3380                         _cache_hold(rep_ncp);
3381                         if (rep_ncp->nc_parent == par_nch->ncp &&
3382                             rep_ncp->nc_nlen == nlc->nlc_namelen &&
3383                             (rep_ncp->nc_flag & NCF_DESTROYED) &&
3384                             rep_ncp->nc_refs == 2)
3385                         {
3386                                 /*
3387                                  * Update nc_name.
3388                                  */
3389                                 ncp = rep_ncp;
3390
3391                                 _cache_ncp_gen_enter(ncp);
3392
3393                                 bcopy(nlc->nlc_nameptr, ncp->nc_name,
3394                                       nlc->nlc_namelen);
3395
3396                                 /*
3397                                  * This takes some care.  We must clear the
3398                                  * NCF_DESTROYED flag before unlocking the
3399                                  * hash chain so other concurrent searches
3400                                  * do not skip this element.
3401                                  *
3402                                  * We must also unlock the hash chain before
3403                                  * unresolving the ncp to avoid deadlocks.
3404                                  * We hold the lock on the ncp so we can safely
3405                                  * reinitialize nc_flag after that.
3406                                  */
3407                                 ncp->nc_flag &= ~NCF_DESTROYED;
3408                                 spin_unlock(&nchpp->spin);      /* use_excl */
3409
3410                                 _cache_setunresolved(ncp, 0);
3411                                 ncp->nc_flag = NCF_UNRESOLVED;
3412                                 ncp->nc_error = ENOTCONN;
3413
3414                                 _cache_ncp_gen_exit(ncp);
3415
3416                                 if (par_locked) {
3417                                         _cache_unlock(par_nch->ncp);
3418                                         par_locked = 0;
3419                                 }
3420                                 if (new_ncp) {
3421                                         _cache_free(new_ncp);
3422                                         new_ncp = NULL; /* safety */
3423                                 }
3424                                 goto found;
3425                         }
3426                         _cache_put(rep_ncp);
3427                 }
3428         }
3429
3430         /*
3431          * Otherwise create a new entry and add it to the cache.  The parent
3432          * ncp must also be locked so we can link into it.
3433          *
3434          * We have to relookup after possibly blocking in kmalloc or
3435          * when locking par_nch.
3436          *
3437          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3438          *       mount case, in which case nc_name will be NULL.
3439          *
3440          * NOTE: In the rep_ncp != NULL case we are trying to reuse
3441          *       a DESTROYED entry, but didn't have an exclusive lock.
3442          *       In this situation we do not create a new_ncp.
3443          */
3444         if (new_ncp == NULL) {
3445                 if (use_excl)
3446                         spin_unlock(&nchpp->spin);
3447                 else
3448                         spin_unlock_shared(&nchpp->spin);
3449                 if (rep_ncp == NULL) {
3450                         new_ncp = cache_alloc(nlc->nlc_namelen);
3451                         if (nlc->nlc_namelen) {
3452                                 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3453                                       nlc->nlc_namelen);
3454                                 new_ncp->nc_name[nlc->nlc_namelen] = 0;
3455                         }
3456                 }
3457                 use_excl = 1;
3458                 goto restart;
3459         }
3460
3461         /*
3462          * NOTE! The spinlock is held exclusively here because new_ncp
3463          *       is non-NULL.
3464          */
3465         if (par_locked == 0) {
3466                 spin_unlock(&nchpp->spin);
3467                 _cache_lock(par_nch->ncp);
3468                 par_locked = 1;
3469                 goto restart;
3470         }
3471
3472         /*
3473          * Link to parent (requires another ref, the one already in new_ncp
3474          * is what we wil lreturn).
3475          *
3476          * WARNING!  We still hold the spinlock.  We have to set the hash
3477          *           table entry atomically.
3478          */
3479         ncp = new_ncp;
3480         ++ncp->nc_refs;
3481         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3482         spin_unlock(&nchpp->spin);
3483         _cache_unlock(par_nch->ncp);
3484         /* par_locked = 0 - not used */
3485 found:
3486         /*
3487          * stats and namecache size management
3488          */
3489         if (ncp->nc_flag & NCF_UNRESOLVED)
3490                 ++gd->gd_nchstats->ncs_miss;
3491         else if (ncp->nc_vp)
3492                 ++gd->gd_nchstats->ncs_goodhits;
3493         else
3494                 ++gd->gd_nchstats->ncs_neghits;
3495         nch.mount = mp;
3496         nch.ncp = ncp;
3497         _cache_mntref(nch.mount);
3498
3499         return(nch);
3500 }
3501
3502 /*
3503  * Attempt to lookup a namecache entry and return with a shared namecache
3504  * lock.  This operates non-blocking.  EWOULDBLOCK is returned if excl is
3505  * set or we are unable to lock.
3506  */
3507 int
3508 cache_nlookup_maybe_shared(struct nchandle *par_nch,
3509                            struct nlcomponent *nlc,
3510                            int excl, struct nchandle *res_nch)
3511 {
3512         struct namecache *ncp;
3513         struct nchash_head *nchpp;
3514         struct mount *mp;
3515         u_int32_t hash;
3516         globaldata_t gd;
3517
3518         /*
3519          * If exclusive requested or shared namecache locks are disabled,
3520          * return failure.
3521          */
3522         if (ncp_shared_lock_disable || excl)
3523                 return(EWOULDBLOCK);
3524
3525         gd = mycpu;
3526         mp = par_nch->mount;
3527
3528         /*
3529          * This is a good time to call it, no ncp's are locked by
3530          * the caller or us.
3531          */
3532         cache_hysteresis(1);
3533
3534         /*
3535          * Try to locate an existing entry
3536          */
3537         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3538         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3539         nchpp = NCHHASH(hash);
3540
3541         spin_lock_shared(&nchpp->spin);
3542
3543         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3544                 /*
3545                  * Break out if we find a matching entry.  Note that
3546                  * UNRESOLVED entries may match, but DESTROYED entries
3547                  * do not.
3548                  */
3549                 if (ncp->nc_parent == par_nch->ncp &&
3550                     ncp->nc_nlen == nlc->nlc_namelen &&
3551                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3552                     (ncp->nc_flag & NCF_DESTROYED) == 0
3553                 ) {
3554                         _cache_hold(ncp);
3555                         spin_unlock_shared(&nchpp->spin);
3556
3557                         if (_cache_lock_shared_special(ncp) == 0) {
3558                                 if (ncp->nc_parent == par_nch->ncp &&
3559                                     ncp->nc_nlen == nlc->nlc_namelen &&
3560                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3561                                          ncp->nc_nlen) == 0 &&
3562                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
3563                                     (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
3564                                     _cache_auto_unresolve_test(mp, ncp) == 0)
3565                                 {
3566                                         goto found;
3567                                 }
3568                                 _cache_unlock(ncp);
3569                         }
3570                         _cache_drop(ncp);
3571                         return(EWOULDBLOCK);
3572                 }
3573         }
3574
3575         /*
3576          * Failure
3577          */
3578         spin_unlock_shared(&nchpp->spin);
3579         return(EWOULDBLOCK);
3580
3581         /*
3582          * Success
3583          *
3584          * Note that nc_error might be non-zero (e.g ENOENT).
3585          */
3586 found:
3587         res_nch->mount = mp;
3588         res_nch->ncp = ncp;
3589         ++gd->gd_nchstats->ncs_goodhits;
3590         _cache_mntref(res_nch->mount);
3591
3592         KKASSERT(ncp->nc_error != EWOULDBLOCK);
3593         return(ncp->nc_error);
3594 }
3595
3596 /*
3597  * This is a non-blocking verison of cache_nlookup() used by
3598  * nfs_readdirplusrpc_uio().  It can fail for any reason and
3599  * will return nch.ncp == NULL in that case.
3600  */
3601 struct nchandle
3602 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
3603 {
3604         struct nchandle nch;
3605         struct namecache *ncp;
3606         struct namecache *new_ncp;
3607         struct nchash_head *nchpp;
3608         struct mount *mp;
3609         u_int32_t hash;
3610         globaldata_t gd;
3611         int par_locked;
3612
3613         gd = mycpu;
3614         mp = par_nch->mount;
3615         par_locked = 0;
3616
3617         /*
3618          * Try to locate an existing entry
3619          */
3620         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3621         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3622         new_ncp = NULL;
3623         nchpp = NCHHASH(hash);
3624 restart:
3625         spin_lock(&nchpp->spin);
3626         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3627                 /*
3628                  * Break out if we find a matching entry.  Note that
3629                  * UNRESOLVED entries may match, but DESTROYED entries
3630                  * do not.
3631                  */
3632                 if (ncp->nc_parent == par_nch->ncp &&
3633                     ncp->nc_nlen == nlc->nlc_namelen &&
3634                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3635                     (ncp->nc_flag & NCF_DESTROYED) == 0
3636                 ) {
3637                         _cache_hold(ncp);
3638                         spin_unlock(&nchpp->spin);
3639                         if (par_locked) {
3640                                 _cache_unlock(par_nch->ncp);
3641                                 par_locked = 0;
3642                         }
3643                         if (_cache_lock_special(ncp) == 0) {
3644                                 if (ncp->nc_parent != par_nch->ncp ||
3645                                     ncp->nc_nlen != nlc->nlc_namelen ||
3646                                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
3647                                     (ncp->nc_flag & NCF_DESTROYED)) {
3648                                         kprintf("cache_lookup_nonblock: "
3649                                                 "ncp-race %p %*.*s\n",
3650                                                 ncp,
3651                                                 nlc->nlc_namelen,
3652                                                 nlc->nlc_namelen,
3653                                                 nlc->nlc_nameptr);
3654                                         _cache_unlock(ncp);
3655                                         _cache_drop(ncp);
3656                                         goto failed;
3657                                 }
3658                                 _cache_auto_unresolve(mp, ncp);
3659                                 if (new_ncp) {
3660                                         _cache_free(new_ncp);
3661                                         new_ncp = NULL;
3662                                 }
3663                                 goto found;
3664                         }
3665                         _cache_drop(ncp);
3666                         goto failed;
3667                 }
3668         }
3669
3670         /*
3671          * We failed to locate an entry, create a new entry and add it to
3672          * the cache.  The parent ncp must also be locked so we
3673          * can link into it.
3674          *
3675          * We have to relookup after possibly blocking in kmalloc or
3676          * when locking par_nch.
3677          *
3678          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3679          *       mount case, in which case nc_name will be NULL.
3680          */
3681         if (new_ncp == NULL) {
3682                 spin_unlock(&nchpp->spin);
3683                 new_ncp = cache_alloc(nlc->nlc_namelen);
3684                 if (nlc->nlc_namelen) {
3685                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3686                               nlc->nlc_namelen);
3687                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3688                 }
3689                 goto restart;
3690         }
3691         if (par_locked == 0) {
3692                 spin_unlock(&nchpp->spin);
3693                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3694                         par_locked = 1;
3695                         goto restart;
3696                 }
3697                 goto failed;
3698         }
3699
3700         /*
3701          * Link to parent (requires another ref, the one already in new_ncp
3702          * is what we wil lreturn).
3703          *
3704          * WARNING!  We still hold the spinlock.  We have to set the hash
3705          *           table entry atomically.
3706          */
3707         ncp = new_ncp;
3708         ++ncp->nc_refs;
3709         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3710         spin_unlock(&nchpp->spin);
3711         _cache_unlock(par_nch->ncp);
3712         /* par_locked = 0 - not used */
3713 found:
3714         /*
3715          * stats and namecache size management
3716          */
3717         if (ncp->nc_flag & NCF_UNRESOLVED)
3718                 ++gd->gd_nchstats->ncs_miss;
3719         else if (ncp->nc_vp)
3720                 ++gd->gd_nchstats->ncs_goodhits;
3721         else
3722                 ++gd->gd_nchstats->ncs_neghits;
3723         nch.mount = mp;
3724         nch.ncp = ncp;
3725         _cache_mntref(nch.mount);
3726
3727         return(nch);
3728 failed:
3729         if (new_ncp) {
3730                 _cache_free(new_ncp);
3731                 new_ncp = NULL;
3732         }
3733         nch.mount = NULL;
3734         nch.ncp = NULL;
3735         return(nch);
3736 }
3737
3738 /*
3739  * This is a non-locking optimized lookup that depends on adding a ref
3740  * to prevent normal eviction.  nch.ncp can be returned as NULL for any
3741  * reason and the caller will retry with normal locking in that case.
3742  *
3743  * This function only returns resolved entries so callers do not accidentally
3744  * race doing out of order / unfenced field checks.
3745  *
3746  * The caller must validate the result for parent-to-child continuity.
3747  */
3748 struct nchandle
3749 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc)
3750 {
3751         struct nchandle nch;
3752         struct namecache *ncp;
3753         struct nchash_head *nchpp;
3754         struct mount *mp;
3755         u_int32_t hash;
3756         globaldata_t gd;
3757
3758         gd = mycpu;
3759         mp = par_nch->mount;
3760
3761         /*
3762          * Try to locate an existing entry
3763          */
3764         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3765         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3766         nchpp = NCHHASH(hash);
3767
3768         spin_lock_shared(&nchpp->spin);
3769         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3770                 /*
3771                  * Break out if we find a matching entry.  Note that
3772                  * UNRESOLVED entries may match, but DESTROYED entries
3773                  * do not.  However, UNRESOLVED entries still return failure.
3774                  */
3775                 if (ncp->nc_parent == par_nch->ncp &&
3776                     ncp->nc_nlen == nlc->nlc_namelen &&
3777                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3778                     (ncp->nc_flag & NCF_DESTROYED) == 0
3779                 ) {
3780                         /*
3781                          * Test NFS timeout for auto-unresolve.  Give up if
3782                          * the entry is not resolved.
3783                          *
3784                          * Getting the ref with the nchpp locked prevents
3785                          * any transition to NCF_DESTROYED.
3786                          */
3787                         if (_cache_auto_unresolve_test(par_nch->mount, ncp))
3788                                 break;
3789                         if (ncp->nc_flag & NCF_UNRESOLVED)
3790                                 break;
3791                         _cache_hold(ncp);
3792                         spin_unlock_shared(&nchpp->spin);
3793
3794                         /*
3795                          * We need an additional test to ensure that the ref
3796                          * we got above prevents transitions to NCF_UNRESOLVED.
3797                          * This can occur if another thread is currently
3798                          * holding the ncp exclusively locked or (if we raced
3799                          * that and it unlocked before our test) the flag
3800                          * has been set.
3801                          *
3802                          * XXX check if superceeded by nc_generation XXX
3803                          */
3804                         if (_cache_lockstatus(ncp) < 0 ||
3805                             (ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED)))
3806                         {
3807                                 if ((ncvp_debug & 4) &&
3808                                     (ncp->nc_flag &
3809                                      (NCF_DESTROYED | NCF_UNRESOLVED)))
3810                                 {
3811                                     kprintf("ncp state change: %p %08x %d %s\n",
3812                                             ncp, ncp->nc_flag, ncp->nc_error,
3813                                             ncp->nc_name);
3814                                 }
3815                                 _cache_drop(ncp);
3816                                 spin_lock_shared(&nchpp->spin);
3817                                 break;
3818                         }
3819
3820                         /*
3821                          * Return the ncp bundled into a nch on success.
3822                          * The ref should passively prevent the ncp from
3823                          * becoming unresolved without having to hold a lock.
3824                          * (XXX this may not be entirely true)
3825                          */
3826                         goto found;
3827                 }
3828         }
3829         spin_unlock_shared(&nchpp->spin);
3830         nch.mount = NULL;
3831         nch.ncp = NULL;
3832
3833         return nch;
3834 found:
3835         /*
3836          * stats and namecache size management
3837          */
3838         if (ncp->nc_flag & NCF_UNRESOLVED)
3839                 ++gd->gd_nchstats->ncs_miss;
3840         else if (ncp->nc_vp)
3841                 ++gd->gd_nchstats->ncs_goodhits;
3842         else
3843                 ++gd->gd_nchstats->ncs_neghits;
3844         nch.mount = mp;
3845         nch.ncp = ncp;
3846         _cache_mntref(nch.mount);
3847
3848         return(nch);
3849 }
3850
3851 /*
3852  * The namecache entry is marked as being used as a mount point.
3853  * Locate the mount if it is visible to the caller.  The DragonFly
3854  * mount system allows arbitrary loops in the topology and disentangles
3855  * those loops by matching against (mp, ncp) rather than just (ncp).
3856  * This means any given ncp can dive any number of mounts, depending
3857  * on the relative mount (e.g. nullfs) the caller is at in the topology.
3858  *
3859  * We use a very simple frontend cache to reduce SMP conflicts,
3860  * which we have to do because the mountlist scan needs an exclusive
3861  * lock around its ripout info list.  Not to mention that there might
3862  * be a lot of mounts.
3863  *
3864  * Because all mounts can potentially be accessed by all cpus, break the cpu's
3865  * down a bit to allow some contention rather than making the cache
3866  * excessively huge.
3867  *
3868  * The hash table is split into per-cpu areas, is 4-way set-associative.
3869  */
3870 struct findmount_info {
3871         struct mount *result;
3872         struct mount *nch_mount;
3873         struct namecache *nch_ncp;
3874 };
3875
3876 static __inline
3877 struct ncmount_cache *
3878 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp)
3879 {
3880         uint32_t hash;
3881
3882         hash = iscsi_crc32(&mp, sizeof(mp));
3883         hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash);
3884         hash ^= hash >> 16;
3885         hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1));
3886
3887         return (&ncmount_cache[hash]);
3888 }
3889
3890 static
3891 struct ncmount_cache *
3892 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3893 {
3894         struct ncmount_cache *ncc;
3895         struct ncmount_cache *best;
3896         int delta;
3897         int best_delta;
3898         int i;
3899
3900         ncc = ncmount_cache_lookup4(mp, ncp);
3901
3902         /*
3903          * NOTE: When checking for a ticks overflow implement a slop of
3904          *       2 ticks just to be safe, because ticks is accessed
3905          *       non-atomically one CPU can increment it while another
3906          *       is still using the old value.
3907          */
3908         if (ncc->ncp == ncp && ncc->mp == mp)   /* 0 */
3909                 return ncc;
3910         delta = (int)(ticks - ncc->ticks);      /* beware GCC opts */
3911         if (delta < -2)                         /* overflow reset */
3912                 ncc->ticks = ticks;
3913         best = ncc;
3914         best_delta = delta;
3915
3916         for (i = 1; i < NCMOUNT_SET; ++i) {     /* 1, 2, 3 */
3917                 ++ncc;
3918                 if (ncc->ncp == ncp && ncc->mp == mp)
3919                         return ncc;
3920                 delta = (int)(ticks - ncc->ticks);
3921                 if (delta < -2)
3922                         ncc->ticks = ticks;
3923                 if (delta > best_delta) {
3924                         best_delta = delta;
3925                         best = ncc;
3926                 }
3927         }
3928         return best;
3929 }
3930
3931 /*
3932  * pcpu-optimized mount search.  Locate the recursive mountpoint, avoid
3933  * doing an expensive mountlist_scan*() if possible.
3934  *
3935  * (mp, ncp) -> mountonpt.k
3936  *
3937  * Returns a referenced mount pointer or NULL
3938  *
3939  * General SMP operation uses a per-cpu umount_spin to interlock unmount
3940  * operations (that is, where the mp_target can be freed out from under us).
3941  *
3942  * Lookups use the ncc->updating counter to validate the contents in order
3943  * to avoid having to obtain the per cache-element spin-lock.  In addition,
3944  * the ticks field is only updated when it changes.  However, if our per-cpu
3945  * lock fails due to an unmount-in-progress, we fall-back to the
3946  * cache-element's spin-lock.
3947  */
3948 struct mount *
3949 cache_findmount(struct nchandle *nch)
3950 {
3951         struct findmount_info info;
3952         struct ncmount_cache *ncc;
3953         struct ncmount_cache ncc_copy;
3954         struct mount *target;
3955         struct pcpu_ncache *pcpu;
3956         struct spinlock *spinlk;
3957         int update;
3958
3959         pcpu = pcpu_ncache;
3960         if (ncmount_cache_enable == 0 || pcpu == NULL) {
3961                 ncc = NULL;
3962                 goto skip;
3963         }
3964         pcpu += mycpu->gd_cpuid;
3965
3966 again:
3967         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3968         if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3969 found:
3970                 /*
3971                  * This is a bit messy for now because we do not yet have
3972                  * safe disposal of mount structures.  We have to ref
3973                  * ncc->mp_target but the 'update' counter only tell us
3974                  * whether the cache has changed after the fact.
3975                  *
3976                  * For now get a per-cpu spinlock that will only contend
3977                  * against umount's.  This is the best path.  If it fails,
3978                  * instead of waiting on the umount we fall-back to a
3979                  * shared ncc->spin lock, which will generally only cost a
3980                  * cache ping-pong.
3981                  */
3982                 update = ncc->updating;
3983                 if (__predict_true(spin_trylock(&pcpu->umount_spin))) {
3984                         spinlk = &pcpu->umount_spin;
3985                 } else {
3986                         spinlk = &ncc->spin;
3987                         spin_lock_shared(spinlk);
3988                 }
3989                 if (update & 1) {               /* update in progress */
3990                         spin_unlock_any(spinlk);
3991                         goto skip;
3992                 }
3993                 ncc_copy = *ncc;
3994                 cpu_lfence();
3995                 if (ncc->updating != update) {  /* content changed */
3996                         spin_unlock_any(spinlk);
3997                         goto again;
3998                 }
3999                 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) {
4000                         spin_unlock_any(spinlk);
4001                         goto again;
4002                 }
4003                 if (ncc_copy.isneg == 0) {
4004                         target = ncc_copy.mp_target;
4005                         if (target->mnt_ncmounton.mount == nch->mount &&
4006                             target->mnt_ncmounton.ncp == nch->ncp) {
4007                                 /*
4008                                  * Cache hit (positive) (avoid dirtying
4009                                  * the cache line if possible)
4010                                  */
4011                                 if (ncc->ticks != (int)ticks)
4012                                         ncc->ticks = (int)ticks;
4013                                 _cache_mntref(target);
4014                         }
4015                 } else {
4016                         /*
4017                          * Cache hit (negative) (avoid dirtying
4018                          * the cache line if possible)
4019                          */
4020                         if (ncc->ticks != (int)ticks)
4021                                 ncc->ticks = (int)ticks;
4022                         target = NULL;
4023                 }
4024                 spin_unlock_any(spinlk);
4025
4026                 return target;
4027         }
4028 skip:
4029
4030         /*
4031          * Slow
4032          */
4033         info.result = NULL;
4034         info.nch_mount = nch->mount;
4035         info.nch_ncp = nch->ncp;
4036         mountlist_scan(cache_findmount_callback, &info,
4037                        MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK);
4038
4039         /*
4040          * To reduce multi-re-entry on the cache, relookup in the cache.
4041          * This can still race, obviously, but that's ok.
4042          */
4043         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
4044         if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
4045                 if (info.result)
4046                         atomic_add_int(&info.result->mnt_refs, -1);
4047                 goto found;
4048         }
4049
4050         /*
4051          * Cache the result.
4052          */
4053         if ((info.result == NULL ||
4054             (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) {
4055                 spin_lock(&ncc->spin);
4056                 atomic_add_int_nonlocked(&ncc->updating, 1);
4057                 cpu_sfence();
4058                 KKASSERT(ncc->updating & 1);
4059                 if (ncc->mp != nch->mount) {
4060                         if (ncc->mp)
4061                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
4062                         atomic_add_int(&nch->mount->mnt_refs, 1);
4063                         ncc->mp = nch->mount;
4064                 }
4065                 ncc->ncp = nch->ncp;    /* ptr compares only, not refd*/
4066                 ncc->ticks = (int)ticks;
4067
4068                 if (info.result) {
4069                         ncc->isneg = 0;
4070                         if (ncc->mp_target != info.result) {
4071                                 if (ncc->mp_target)
4072                                         atomic_add_int(&ncc->mp_target->mnt_refs, -1);
4073                                 ncc->mp_target = info.result;
4074                                 atomic_add_int(&info.result->mnt_refs, 1);
4075                         }
4076                 } else {
4077                         ncc->isneg = 1;
4078                         if (ncc->mp_target) {
4079                                 atomic_add_int(&ncc->mp_target->mnt_refs, -1);
4080                                 ncc->mp_target = NULL;
4081                         }
4082                 }
4083                 cpu_sfence();
4084                 atomic_add_int_nonlocked(&ncc->updating, 1);
4085                 spin_unlock(&ncc->spin);
4086         }
4087         return(info.result);
4088 }
4089
4090 static
4091 int
4092 cache_findmount_callback(struct mount *mp, void *data)
4093 {
4094         struct findmount_info *info = data;
4095
4096         /*
4097          * Check the mount's mounted-on point against the passed nch.
4098          */
4099         if (mp->mnt_ncmounton.mount == info->nch_mount &&
4100             mp->mnt_ncmounton.ncp == info->nch_ncp
4101         ) {
4102             info->result = mp;
4103             _cache_mntref(mp);
4104             return(-1);
4105         }
4106         return(0);
4107 }
4108
4109 void
4110 cache_dropmount(struct mount *mp)
4111 {
4112         _cache_mntrel(mp);
4113 }
4114
4115 /*
4116  * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive
4117  * or negative).
4118  *
4119  * A full scan is not required, but for now just do it anyway.
4120  */
4121 void
4122 cache_ismounting(struct mount *mp)
4123 {
4124         struct ncmount_cache *ncc;
4125         struct mount *ncc_mp;
4126         int i;
4127
4128         if (pcpu_ncache == NULL)
4129                 return;
4130
4131         for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
4132                 ncc = &ncmount_cache[i];
4133                 if (ncc->mp != mp->mnt_ncmounton.mount ||
4134                     ncc->ncp != mp->mnt_ncmounton.ncp) {
4135                         continue;
4136                 }
4137                 spin_lock(&ncc->spin);
4138                 atomic_add_int_nonlocked(&ncc->updating, 1);
4139                 cpu_sfence();
4140                 KKASSERT(ncc->updating & 1);
4141                 if (ncc->mp != mp->mnt_ncmounton.mount ||
4142                     ncc->ncp != mp->mnt_ncmounton.ncp) {
4143                         cpu_sfence();
4144                         ++ncc->updating;
4145                         spin_unlock(&ncc->spin);
4146                         continue;
4147                 }
4148                 ncc_mp = ncc->mp;
4149                 ncc->ncp = NULL;
4150                 ncc->mp = NULL;
4151                 if (ncc_mp)
4152                         atomic_add_int(&ncc_mp->mnt_refs, -1);
4153                 ncc_mp = ncc->mp_target;
4154                 ncc->mp_target = NULL;
4155                 if (ncc_mp)
4156                         atomic_add_int(&ncc_mp->mnt_refs, -1);
4157                 ncc->ticks = (int)ticks - hz * 120;
4158
4159                 cpu_sfence();
4160                 atomic_add_int_nonlocked(&ncc->updating, 1);
4161                 spin_unlock(&ncc->spin);
4162         }
4163
4164         /*
4165          * Pre-cache the mount point
4166          */
4167         ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount,
4168                                    mp->mnt_ncmounton.ncp);
4169
4170         spin_lock(&ncc->spin);
4171         atomic_add_int_nonlocked(&ncc->updating, 1);
4172         cpu_sfence();
4173         KKASSERT(ncc->updating & 1);
4174
4175         if (ncc->mp)
4176                 atomic_add_int(&ncc->mp->mnt_refs, -1);
4177         atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1);
4178         ncc->mp = mp->mnt_ncmounton.mount;
4179         ncc->ncp = mp->mnt_ncmounton.ncp;       /* ptr compares only */
4180         ncc->ticks = (int)ticks;
4181
4182         ncc->isneg = 0;
4183         if (ncc->mp_target != mp) {
4184                 if (ncc->mp_target)
4185                         atomic_add_int(&ncc->mp_target->mnt_refs, -1);
4186                 ncc->mp_target = mp;
4187                 atomic_add_int(&mp->mnt_refs, 1);
4188         }
4189         cpu_sfence();
4190         atomic_add_int_nonlocked(&ncc->updating, 1);
4191         spin_unlock(&ncc->spin);
4192 }
4193
4194 /*
4195  * Scrap any ncmount_cache entries related to mp.  Not only do we need to
4196  * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any
4197  * negative hits involving (mp, <any>).
4198  *
4199  * A full scan is required.
4200  */
4201 void
4202 cache_unmounting(struct mount *mp)
4203 {
4204         struct ncmount_cache *ncc;
4205         struct pcpu_ncache *pcpu;
4206         struct mount *ncc_mp;
4207         int i;
4208
4209         pcpu = pcpu_ncache;
4210         if (pcpu == NULL)
4211                 return;
4212
4213         for (i = 0; i < ncpus; ++i)
4214                 spin_lock(&pcpu[i].umount_spin);
4215
4216         for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
4217                 ncc = &ncmount_cache[i];
4218                 if (ncc->mp != mp && ncc->mp_target != mp)
4219                         continue;
4220                 spin_lock(&ncc->spin);
4221                 atomic_add_int_nonlocked(&ncc->updating, 1);
4222                 cpu_sfence();
4223
4224                 if (ncc->mp != mp && ncc->mp_target != mp) {
4225                         atomic_add_int_nonlocked(&ncc->updating, 1);
4226                         cpu_sfence();
4227                         spin_unlock(&ncc->spin);
4228                         continue;
4229                 }
4230                 ncc_mp = ncc->mp;
4231                 ncc->ncp = NULL;
4232                 ncc->mp = NULL;
4233                 if (ncc_mp)
4234                         atomic_add_int(&ncc_mp->mnt_refs, -1);
4235                 ncc_mp = ncc->mp_target;
4236                 ncc->mp_target = NULL;
4237                 if (ncc_mp)
4238                         atomic_add_int(&ncc_mp->mnt_refs, -1);
4239                 ncc->ticks = (int)ticks - hz * 120;
4240
4241                 cpu_sfence();
4242                 atomic_add_int_nonlocked(&ncc->updating, 1);
4243                 spin_unlock(&ncc->spin);
4244         }
4245
4246         for (i = 0; i < ncpus; ++i)
4247                 spin_unlock(&pcpu[i].umount_spin);
4248 }
4249
4250 /*
4251  * Resolve an unresolved namecache entry, generally by looking it up.
4252  * The passed ncp must be locked and refd.
4253  *
4254  * Theoretically since a vnode cannot be recycled while held, and since
4255  * the nc_parent chain holds its vnode as long as children exist, the
4256  * direct parent of the cache entry we are trying to resolve should
4257  * have a valid vnode.  If not then generate an error that we can
4258  * determine is related to a resolver bug.
4259  *
4260  * However, if a vnode was in the middle of a recyclement when the NCP
4261  * got locked, ncp->nc_vp might point to a vnode that is about to become
4262  * invalid.  cache_resolve() handles this case by unresolving the entry
4263  * and then re-resolving it.
4264  *
4265  * Note that successful resolution does not necessarily return an error
4266  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
4267  * will be returned.
4268  *
4269  * (*genp) is adjusted based on our resolution operation.  If it is already
4270  * wrong, that's ok... it will still be wrong on return.
4271  */
4272 int
4273 cache_resolve(struct nchandle *nch, u_int *genp, struct ucred *cred)
4274 {
4275         struct namecache *par_tmp;
4276         struct namecache *par;
4277         struct namecache *ncp;
4278         struct nchandle nctmp;
4279         struct mount *mp;
4280         struct vnode *dvp;
4281         int error;
4282
4283         ncp = nch->ncp;
4284         mp = nch->mount;
4285         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
4286
4287 restart:
4288         /*
4289          * If the ncp is already resolved we have nothing to do.  However,
4290          * we do want to guarentee that a usable vnode is returned when
4291          * a vnode is present, so make sure it hasn't been reclaimed.
4292          */
4293         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4294                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
4295                         _cache_ncp_gen_enter(ncp);
4296                         _cache_setunresolved(ncp, 0);
4297                         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4298                                 _cache_ncp_gen_exit(ncp);
4299                                 *genp += 4;
4300                                 return (ncp->nc_error);
4301                         }
4302                 } else if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4303                         return (ncp->nc_error);
4304                 } else {
4305                         _cache_ncp_gen_enter(ncp);
4306                 }
4307         } else {
4308                 _cache_ncp_gen_enter(ncp);
4309         }
4310         /* in gen_enter state */
4311         *genp += 4;
4312
4313         /*
4314          * If the ncp was destroyed it will never resolve again.  This
4315          * can basically only happen when someone is chdir'd into an
4316          * empty directory which is then rmdir'd.  We want to catch this
4317          * here and not dive the VFS because the VFS might actually
4318          * have a way to re-resolve the disconnected ncp, which will
4319          * result in inconsistencies in the cdir/nch for proc->p_fd.
4320          */
4321         if (ncp->nc_flag & NCF_DESTROYED) {
4322                 _cache_ncp_gen_exit(ncp);
4323                 return(EINVAL);
4324         }
4325
4326         /*
4327          * Mount points need special handling because the parent does not
4328          * belong to the same filesystem as the ncp.
4329          */
4330         if (ncp == mp->mnt_ncmountpt.ncp) {
4331                 error = cache_resolve_mp(mp, 0);
4332                 _cache_ncp_gen_exit(ncp);
4333                 return error;
4334         }
4335
4336         /*
4337          * We expect an unbroken chain of ncps to at least the mount point,
4338          * and even all the way to root (but this code doesn't have to go
4339          * past the mount point).
4340          */
4341         if (ncp->nc_parent == NULL) {
4342                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
4343                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
4344                 ncp->nc_error = EXDEV;
4345                 _cache_ncp_gen_exit(ncp);
4346                 return(ncp->nc_error);
4347         }
4348
4349         /*
4350          * The vp's of the parent directories in the chain are held via vhold()
4351          * due to the existance of the child, and should not disappear.
4352          * However, there are cases where they can disappear:
4353          *
4354          *      - due to filesystem I/O errors.
4355          *      - due to NFS being stupid about tracking the namespace and
4356          *        destroys the namespace for entire directories quite often.
4357          *      - due to forced unmounts.
4358          *      - due to an rmdir (parent will be marked DESTROYED)
4359          *
4360          * When this occurs we have to track the chain backwards and resolve
4361          * it, looping until the resolver catches up to the current node.  We
4362          * could recurse here but we might run ourselves out of kernel stack
4363          * so we do it in a more painful manner.  This situation really should
4364          * not occur all that often, or if it does not have to go back too
4365          * many nodes to resolve the ncp.
4366          */
4367         while ((dvp = cache_dvpref(ncp)) == NULL) {
4368                 /*
4369                  * This case can occur if a process is CD'd into a
4370                  * directory which is then rmdir'd.  If the parent is marked
4371                  * destroyed there is no point trying to resolve it.
4372                  */
4373                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) {
4374                         if (ncvp_debug & 8) {
4375                                 kprintf("nc_parent destroyed: %s/%s\n",
4376                                         ncp->nc_parent->nc_name, ncp->nc_name);
4377                         }
4378                         _cache_ncp_gen_exit(ncp);
4379                         return(ENOENT);
4380                 }
4381                 par = ncp->nc_parent;
4382                 _cache_hold(par);
4383                 _cache_lock(par);
4384                 while ((par_tmp = par->nc_parent) != NULL &&
4385                        par_tmp->nc_vp == NULL) {
4386                         _cache_hold(par_tmp);
4387                         _cache_lock(par_tmp);
4388                         _cache_put(par);
4389                         par = par_tmp;
4390                 }
4391                 if (par->nc_parent == NULL) {
4392                         kprintf("EXDEV case 2 %*.*s\n",
4393                                 par->nc_nlen, par->nc_nlen, par->nc_name);
4394                         _cache_put(par);
4395                         _cache_ncp_gen_exit(ncp);
4396                         return (EXDEV);
4397                 }
4398                 /*
4399                  * The parent is not set in stone, ref and lock it to prevent
4400                  * it from disappearing.  Also note that due to renames it
4401                  * is possible for our ncp to move and for par to no longer
4402                  * be one of its parents.  We resolve it anyway, the loop
4403                  * will handle any moves.
4404                  */
4405                 _cache_get(par);        /* additional hold/lock */
4406                 _cache_put(par);        /* from earlier hold/lock */
4407                 if (par == nch->mount->mnt_ncmountpt.ncp) {
4408                         cache_resolve_mp(nch->mount, 0);
4409                 } else if ((dvp = cache_dvpref(par)) == NULL) {
4410                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
4411                                 par->nc_nlen, par->nc_nlen, par->nc_name);
4412                         _cache_put(par);
4413                         continue;
4414                 } else {
4415                         if (par->nc_flag & NCF_UNRESOLVED) {
4416                                 nctmp.mount = mp;
4417                                 nctmp.ncp = par;
4418                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4419                         }
4420                         vrele(dvp);
4421                 }
4422                 if ((error = par->nc_error) != 0) {
4423                         if (par->nc_error != EAGAIN) {
4424                                 kprintf("EXDEV case 3 %*.*s error %d\n",
4425                                     par->nc_nlen, par->nc_nlen, par->nc_name,
4426                                     par->nc_error);
4427                                 _cache_put(par);
4428                                 _cache_ncp_gen_exit(ncp);
4429                                 return(error);
4430                         }
4431                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
4432                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
4433                 }
4434                 _cache_put(par);
4435                 /* loop */
4436         }
4437
4438         /*
4439          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
4440          * ncp's and reattach them.  If this occurs the original ncp is marked
4441          * EAGAIN to force a relookup.
4442          *
4443          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
4444          * ncp must already be resolved.
4445          */
4446         if (dvp) {
4447                 nctmp.mount = mp;
4448                 nctmp.ncp = ncp;
4449                 *genp += 4;     /* setvp bumps the generation */
4450                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4451                 vrele(dvp);
4452         } else {
4453                 ncp->nc_error = EPERM;
4454         }
4455
4456         if (ncp->nc_error == EAGAIN) {
4457                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
4458                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
4459                 goto restart;
4460         }
4461         _cache_ncp_gen_exit(ncp);
4462
4463         return(ncp->nc_error);
4464 }
4465
4466 /*
4467  * Resolve the ncp associated with a mount point.  Such ncp's almost always
4468  * remain resolved and this routine is rarely called.  NFS MPs tends to force
4469  * re-resolution more often due to its mac-truck-smash-the-namecache
4470  * method of tracking namespace changes.
4471  *
4472  * The semantics for this call is that the passed ncp must be locked on
4473  * entry and will be locked on return.  However, if we actually have to
4474  * resolve the mount point we temporarily unlock the entry in order to
4475  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
4476  * the unlock we have to recheck the flags after we relock.
4477  */
4478 static int
4479 cache_resolve_mp(struct mount *mp, int adjgen)
4480 {
4481         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
4482         struct vnode *vp;
4483         int error;
4484
4485         KKASSERT(mp != NULL);
4486
4487         /*
4488          * If the ncp is already resolved we have nothing to do.  However,
4489          * we do want to guarentee that a usable vnode is returned when
4490          * a vnode is present, so make sure it hasn't been reclaimed.
4491          */
4492         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4493                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
4494                         _cache_setunresolved(ncp, adjgen);
4495         }
4496
4497         if (ncp->nc_flag & NCF_UNRESOLVED) {
4498                 /*
4499                  * ncp must be unlocked across the vfs_busy(), but
4500                  * once busied lock ordering is ncp(s), then vnodes,
4501                  * so we must relock the ncp before issuing the VFS_ROOT().
4502                  */
4503                 _cache_unlock(ncp);
4504                 while (vfs_busy(mp, 0))
4505                         ;
4506                 _cache_lock(ncp);
4507                 error = VFS_ROOT(mp, &vp);
4508
4509                 /*
4510                  * recheck the ncp state after relocking.
4511                  */
4512                 if (ncp->nc_flag & NCF_UNRESOLVED) {
4513                         ncp->nc_error = error;
4514                         if (error == 0) {
4515                                 _cache_setvp(mp, ncp, vp, adjgen);
4516                                 vput(vp);
4517                         } else {
4518                                 kprintf("[diagnostic] cache_resolve_mp: failed"
4519                                         " to resolve mount %p err=%d ncp=%p\n",
4520                                         mp, error, ncp);
4521                                 _cache_setvp(mp, ncp, NULL, adjgen);
4522                         }
4523                 } else if (error == 0) {
4524                         vput(vp);
4525                 }
4526                 vfs_unbusy(mp);
4527         }
4528         return(ncp->nc_error);
4529 }
4530
4531 /*
4532  * Resolve the parent vnode
4533  */
4534 int
4535 cache_resolve_dvp(struct nchandle *nch, struct ucred *cred, struct vnode **dvpp)
4536 {
4537         struct namecache *par_tmp;
4538         struct namecache *par;
4539         struct namecache *ncp;
4540         struct nchandle nctmp;
4541         struct mount *mp;
4542         struct vnode *dvp;
4543         int error;
4544
4545         *dvpp = NULL;
4546         ncp = nch->ncp;
4547         mp = nch->mount;
4548         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
4549
4550         /*
4551          * Treat this as a mount point even if it has a parent (e.g.
4552          * null-mount).  Return a NULL dvp and no error.
4553          */
4554         if (ncp == mp->mnt_ncmountpt.ncp)
4555                 return 0;
4556
4557         /*
4558          * If the ncp was destroyed there is no parent directory, return
4559          * EINVAL.
4560          */
4561         if (ncp->nc_flag & NCF_DESTROYED)
4562                 return(EINVAL);
4563
4564         /*
4565          * No parent if at the root of a filesystem, no error.  Typically
4566          * not applicable to null-mounts.  This case should have been caught
4567          * in the above ncmountpt check.
4568          */
4569         if (ncp->nc_parent == NULL)
4570                 return 0;
4571
4572         /*
4573          * Resolve the parent dvp.
4574          *
4575          * The vp's of the parent directories in the chain are held via vhold()
4576          * due to the existance of the child, and should not disappear.
4577          * However, there are cases where they can disappear:
4578          *
4579          *      - due to filesystem I/O errors.
4580          *      - due to NFS being stupid about tracking the namespace and
4581          *        destroys the namespace for entire directories quite often.
4582          *      - due to forced unmounts.
4583          *      - due to an rmdir (parent will be marked DESTROYED)
4584          *
4585          * When this occurs we have to track the chain backwards and resolve
4586          * it, looping until the resolver catches up to the current node.  We
4587          * could recurse here but we might run ourselves out of kernel stack
4588          * so we do it in a more painful manner.  This situation really should
4589          * not occur all that often, or if it does not have to go back too
4590          * many nodes to resolve the ncp.
4591          */
4592         while ((dvp = cache_dvpref(ncp)) == NULL) {
4593                 /*
4594                  * This case can occur if a process is CD'd into a
4595                  * directory which is then rmdir'd.  If the parent is marked
4596                  * destroyed there is no point trying to resolve it.
4597                  */
4598                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
4599                         return(ENOENT);
4600                 par = ncp->nc_parent;
4601                 _cache_hold(par);
4602                 _cache_lock(par);
4603                 while ((par_tmp = par->nc_parent) != NULL &&
4604                        par_tmp->nc_vp == NULL) {
4605                         _cache_hold(par_tmp);
4606                         _cache_lock(par_tmp);
4607                         _cache_put(par);
4608                         par = par_tmp;
4609                 }
4610                 if (par->nc_parent == NULL) {
4611                         kprintf("EXDEV case 2 %*.*s\n",
4612                                 par->nc_nlen, par->nc_nlen, par->nc_name);
4613                         _cache_put(par);
4614                         return (EXDEV);
4615                 }
4616
4617                 /*
4618                  * The parent is not set in stone, ref and lock it to prevent
4619                  * it from disappearing.  Also note that due to renames it
4620                  * is possible for our ncp to move and for par to no longer
4621                  * be one of its parents.  We resolve it anyway, the loop
4622                  * will handle any moves.
4623                  */
4624                 _cache_get(par);        /* additional hold/lock */
4625                 _cache_put(par);        /* from earlier hold/lock */
4626                 if (par == nch->mount->mnt_ncmountpt.ncp) {
4627                         cache_resolve_mp(nch->mount, 1);
4628                 } else if ((dvp = cache_dvpref(par)) == NULL) {
4629                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
4630                                 par->nc_nlen, par->nc_nlen, par->nc_name);
4631                         _cache_put(par);
4632                         continue;
4633                 } else {
4634                         if (par->nc_flag & NCF_UNRESOLVED) {
4635                                 nctmp.mount = mp;
4636                                 nctmp.ncp = par;
4637                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4638                         }
4639                         vrele(dvp);
4640                 }
4641                 if ((error = par->nc_error) != 0) {
4642                         if (par->nc_error != EAGAIN) {
4643                                 kprintf("EXDEV case 3 %*.*s error %d\n",
4644                                     par->nc_nlen, par->nc_nlen, par->nc_name,
4645                                     par->nc_error);
4646                                 _cache_put(par);
4647                                 return(error);
4648                         }
4649                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
4650                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
4651                 }
4652                 _cache_put(par);
4653                 /* loop */
4654         }
4655
4656         /*
4657          * We have a referenced dvp
4658          */
4659         *dvpp = dvp;
4660         return 0;
4661 }
4662
4663 /*
4664  * Clean out negative cache entries when too many have accumulated.
4665  */
4666 static void
4667 _cache_cleanneg(long count)
4668 {
4669         struct pcpu_ncache *pn;
4670         struct namecache *ncp;
4671         static uint32_t neg_rover;
4672         uint32_t n;
4673         long vnegs;
4674
4675         n = neg_rover++;        /* SMP heuristical, race ok */
4676         cpu_ccfence();
4677         n = n % (uint32_t)ncpus;
4678
4679         /*
4680          * Normalize vfscache_negs and count.  count is sometimes based
4681          * on vfscache_negs.  vfscache_negs is heuristical and can sometimes
4682          * have crazy values.
4683          */
4684         vnegs = vfscache_negs;
4685         cpu_ccfence();
4686         if (vnegs <= MINNEG)
4687                 vnegs = MINNEG;
4688         if (count < 1)
4689                 count = 1;
4690
4691         pn = &pcpu_ncache[n];
4692         spin_lock(&pn->neg_spin);
4693         count = pn->neg_count * count / vnegs + 1;
4694         spin_unlock(&pn->neg_spin);
4695
4696         /*
4697          * Attempt to clean out the specified number of negative cache
4698          * entries.
4699          */
4700         while (count > 0) {
4701                 spin_lock(&pn->neg_spin);
4702                 ncp = TAILQ_FIRST(&pn->neg_list);
4703                 if (ncp == NULL) {
4704                         spin_unlock(&pn->neg_spin);
4705                         break;
4706                 }
4707                 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
4708                 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
4709                 _cache_hold(ncp);
4710                 spin_unlock(&pn->neg_spin);
4711
4712                 /*
4713                  * This can race, so we must re-check that the ncp
4714                  * is on the ncneg.list after successfully locking it.
4715                  *
4716                  * Don't scrap actively referenced ncps.  There should be
4717                  * 3 refs.  The natural ref, one from being on the neg list,
4718                  * and one from us.
4719                  *
4720                  * Recheck fields after successfully locking to ensure
4721                  * that it is in-fact still on the negative list with no
4722                  * extra refs.
4723                  *
4724                  * WARNING! On the ncneglist scan any race against other
4725                  *          destructors (zaps or cache_inval_vp_quick() calls)
4726                  *          will have already unresolved the ncp and cause
4727                  *          us to drop instead of zap.  This fine, if
4728                  *          our drop winds up being the last one it will
4729                  *          kfree() the ncp.
4730                  */
4731                 if (_cache_lock_special(ncp) == 0) {
4732                         if (ncp->nc_vp == NULL &&
4733                             ncp->nc_refs == 3 &&
4734                             (ncp->nc_flag & NCF_UNRESOLVED) == 0)
4735                         {
4736                                 ++pcpu_ncache[mycpu->gd_cpuid].clean_neg_count;
4737                                 cache_zap(ncp);
4738                         } else {
4739                                 _cache_unlock(ncp);
4740                                 _cache_drop(ncp);
4741                         }
4742                 } else {
4743                         _cache_drop(ncp);
4744                 }
4745                 --count;
4746         }
4747 }
4748
4749 /*
4750  * Clean out unresolved cache entries when too many have accumulated.
4751  * Resolved cache entries are cleaned out via the vnode reclamation
4752  * mechanism and by _cache_cleanneg().
4753  */
4754 static void
4755 _cache_cleanpos(long ucount, long xcount)
4756 {
4757         static volatile int rover;
4758         struct nchash_head *nchpp;
4759         struct namecache *ncp;
4760         long count;
4761         int rover_copy;
4762
4763         /*
4764          * Don't burn too much cpu looking for stuff
4765          */
4766         count = (ucount > xcount) ? ucount : xcount;
4767         count = count * 4;
4768
4769         /*
4770          * Attempt to clean out the specified number of cache entries.
4771          */
4772         while (count > 0 && (ucount > 0 || xcount > 0)) {
4773                 rover_copy = atomic_fetchadd_int(&rover, 1);
4774                 cpu_ccfence();
4775                 nchpp = NCHHASH(rover_copy);
4776
4777                 if (TAILQ_FIRST(&nchpp->list) == NULL) {
4778                         --count;
4779                         continue;
4780                 }
4781
4782                 /*
4783                  * Get the next ncp
4784                  */
4785                 spin_lock(&nchpp->spin);
4786                 ncp = TAILQ_FIRST(&nchpp->list);
4787
4788                 /*
4789                  * Skip placeholder ncp's.  Do not shift their
4790                  * position in the list.
4791                  */
4792                 while (ncp && (ncp->nc_flag & NCF_DUMMY))
4793                         ncp = TAILQ_NEXT(ncp, nc_hash);
4794
4795                 if (ncp) {
4796                         /*
4797                          * Move to end of list
4798                          */
4799                         TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
4800                         TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
4801
4802                         if (ncp->nc_refs != ncpbaserefs(ncp)) {
4803                                 /*
4804                                  * Do not destroy internal nodes that have
4805                                  * children or nodes which have thread
4806                                  * references.
4807                                  */
4808                                 ncp = NULL;
4809                         } else if (ucount > 0 &&
4810                                    (ncp->nc_flag & NCF_UNRESOLVED))
4811                         {
4812                                 /*
4813                                  * Destroy unresolved nodes if asked.
4814                                  */
4815                                 --ucount;
4816                                 --xcount;
4817                                 _cache_hold(ncp);
4818                         } else if (xcount > 0) {
4819                                 /*
4820                                  * Destroy any other node if asked.
4821                                  */
4822                                 --xcount;
4823                                 _cache_hold(ncp);
4824                         } else {
4825                                 /*
4826                                  * Otherwise don't
4827                                  */
4828                                 ncp = NULL;
4829                         }
4830                 }
4831                 spin_unlock(&nchpp->spin);
4832
4833                 /*
4834                  * Try to scap the ncp if we can do so non-blocking.
4835                  * We must re-check nc_refs after locking, and it will
4836                  * have one additional ref from above.
4837                  */
4838                 if (ncp) {
4839                         if (_cache_lock_special(ncp) == 0) {
4840                                 if (ncp->nc_refs == 1 + ncpbaserefs(ncp)) {
4841                                         ++pcpu_ncache[mycpu->gd_cpuid].
4842                                                 clean_pos_count;
4843                                         cache_zap(ncp);
4844                                 } else {
4845                                         _cache_unlock(ncp);
4846                                         _cache_drop(ncp);
4847                                 }
4848                         } else {
4849                                 _cache_drop(ncp);
4850                         }
4851                 }
4852                 --count;
4853         }
4854 }
4855
4856 /*
4857  * This is a kitchen sink function to clean out ncps which we
4858  * tried to zap from cache_drop() but failed because we were
4859  * unable to acquire the parent lock.
4860  *
4861  * Such entries can also be removed via cache_inval_vp(), such
4862  * as when unmounting.
4863  */
4864 static void
4865 _cache_cleandefered(void)
4866 {
4867         struct nchash_head *nchpp;
4868         struct namecache *ncp;
4869         struct namecache dummy;
4870         int i;
4871
4872         /*
4873          * Create a list iterator.  DUMMY indicates that this is a list
4874          * iterator, DESTROYED prevents matches by lookup functions.
4875          */
4876         numdefered = 0;
4877         pcpu_ncache[mycpu->gd_cpuid].numdefered = 0;
4878         bzero(&dummy, sizeof(dummy));
4879         dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY;
4880         dummy.nc_refs = 1;
4881
4882         for (i = 0; i <= nchash; ++i) {
4883                 nchpp = &nchashtbl[i];
4884
4885                 spin_lock(&nchpp->spin);
4886                 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
4887                 ncp = &dummy;
4888                 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
4889                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
4890                                 continue;
4891                         TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4892                         TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash);
4893                         _cache_hold(ncp);
4894                         spin_unlock(&nchpp->spin);
4895                         if (_cache_lock_nonblock(ncp) == 0) {
4896                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
4897                                 _cache_unlock(ncp);
4898                         }
4899                         _cache_drop(ncp);
4900                         spin_lock(&nchpp->spin);
4901                         ncp = &dummy;
4902                 }
4903                 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4904                 spin_unlock(&nchpp->spin);
4905         }
4906 }
4907
4908 /*
4909  * Name cache initialization, from vfsinit() when we are booting
4910  */
4911 void
4912 nchinit(void)
4913 {
4914         struct pcpu_ncache *pn;
4915         globaldata_t gd;
4916         int i;
4917
4918         /*
4919          * Per-cpu accounting and negative hit list
4920          */
4921         pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
4922                               M_VFSCACHEAUX, M_WAITOK|M_ZERO);
4923         for (i = 0; i < ncpus; ++i) {
4924                 pn = &pcpu_ncache[i];
4925                 TAILQ_INIT(&pn->neg_list);
4926                 spin_init(&pn->neg_spin, "ncneg");
4927                 spin_init(&pn->umount_spin, "ncumm");
4928         }
4929
4930         /*
4931          * Initialise per-cpu namecache effectiveness statistics.
4932          */
4933         for (i = 0; i < ncpus; ++i) {
4934                 gd = globaldata_find(i);
4935                 gd->gd_nchstats = &nchstats[i];
4936         }
4937
4938         /*
4939          * Create a generous namecache hash table
4940          */
4941         nchashtbl = hashinit_ext(vfs_inodehashsize(),
4942                                  sizeof(struct nchash_head),
4943                                  M_VFSCACHEAUX, &nchash);
4944         for (i = 0; i <= (int)nchash; ++i) {
4945                 TAILQ_INIT(&nchashtbl[i].list);
4946                 spin_init(&nchashtbl[i].spin, "nchinit_hash");
4947         }
4948         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
4949                 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
4950         nclockwarn = 5 * hz;
4951 }
4952
4953 /*
4954  * Called from start_init() to bootstrap the root filesystem.  Returns
4955  * a referenced, unlocked namecache record to serve as a root or the
4956  * root of the system.
4957  *
4958  * Adjust our namecache counts
4959  */
4960 void
4961 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
4962 {
4963         /*struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];*/
4964
4965         /* nc_parent is NULL, doesn't count as a leaf or unresolved */
4966         /*atomic_add_long(&pn->vfscache_leafs, 1);*/
4967         /*atomic_add_long(&pn->vfscache_unres, 1);*/
4968
4969         nch->ncp = cache_alloc(0);
4970         nch->mount = mp;
4971         _cache_mntref(mp);
4972         if (vp)
4973                 _cache_setvp(nch->mount, nch->ncp, vp, 1);
4974 }
4975
4976 /*
4977  * vfs_cache_setroot()
4978  *
4979  *      Create an association between the root of our namecache and
4980  *      the root vnode.  This routine may be called several times during
4981  *      booting.
4982  *
4983  *      If the caller intends to save the returned namecache pointer somewhere
4984  *      it must cache_hold() it.
4985  */
4986 void
4987 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
4988 {
4989         struct vnode *ovp;
4990         struct nchandle onch;
4991
4992         ovp = rootvnode;
4993         onch = rootnch;
4994         rootvnode = nvp;
4995         if (nch)
4996                 rootnch = *nch;
4997         else
4998                 cache_zero(&rootnch);
4999         if (ovp)
5000                 vrele(ovp);
5001         if (onch.ncp)
5002                 cache_drop(&onch);
5003 }
5004
5005 /*
5006  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
5007  * topology and is being removed as quickly as possible.  The new VOP_N*()
5008  * API calls are required to make specific adjustments using the supplied
5009  * ncp pointers rather then just bogusly purging random vnodes.
5010  *
5011  * Invalidate all namecache entries to a particular vnode as well as
5012  * any direct children of that vnode in the namecache.  This is a
5013  * 'catch all' purge used by filesystems that do not know any better.
5014  *
5015  * Note that the linkage between the vnode and its namecache entries will
5016  * be removed, but the namecache entries themselves might stay put due to
5017  * active references from elsewhere in the system or due to the existance of
5018  * the children.   The namecache topology is left intact even if we do not
5019  * know what the vnode association is.  Such entries will be marked
5020  * NCF_UNRESOLVED.
5021  */
5022 void
5023 cache_purge(struct vnode *vp)
5024 {
5025         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
5026 }
5027
5028 __read_mostly static int disablecwd;
5029 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
5030     "Disable getcwd");
5031
5032 /*
5033  * MPALMOSTSAFE
5034  */
5035 int
5036 sys___getcwd(struct sysmsg *sysmsg, const struct __getcwd_args *uap)
5037 {
5038         u_int buflen;
5039         int error;
5040         char *buf;
5041         char *bp;
5042
5043         if (disablecwd)
5044                 return (ENODEV);
5045
5046         buflen = uap->buflen;
5047         if (buflen == 0)
5048                 return (EINVAL);
5049         if (buflen > MAXPATHLEN)
5050                 buflen = MAXPATHLEN;
5051
5052         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
5053         bp = kern_getcwd(buf, buflen, &error);
5054         if (error == 0)
5055                 error = copyout(bp, uap->buf, strlen(bp) + 1);
5056         kfree(buf, M_TEMP);
5057         return (error);
5058 }
5059
5060 char *
5061 kern_getcwd(char *buf, size_t buflen, int *error)
5062 {
5063         struct proc *p = curproc;
5064         char *bp;
5065         int i, slash_prefixed;
5066         struct filedesc *fdp;
5067         struct nchandle nch;
5068         struct namecache *ncp;
5069
5070         bp = buf;
5071         bp += buflen - 1;
5072         *bp = '\0';
5073         fdp = p->p_fd;
5074         slash_prefixed = 0;
5075
5076         nch = fdp->fd_ncdir;
5077         ncp = nch.ncp;
5078         if (ncp)
5079                 _cache_hold(ncp);
5080
5081         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
5082                nch.mount != fdp->fd_nrdir.mount)
5083         ) {
5084                 if (ncp->nc_flag & NCF_DESTROYED) {
5085                         _cache_drop(ncp);
5086                         ncp = NULL;
5087                         break;
5088                 }
5089                 /*
5090                  * While traversing upwards if we encounter the root
5091                  * of the current mount we have to skip to the mount point
5092                  * in the underlying filesystem.
5093                  */
5094                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
5095                         nch = nch.mount->mnt_ncmounton;
5096                         _cache_drop(ncp);
5097                         ncp = nch.ncp;
5098                         if (ncp)
5099                                 _cache_hold(ncp);
5100                         continue;
5101                 }
5102
5103                 /*
5104                  * Prepend the path segment
5105                  */
5106                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
5107                         if (bp == buf) {
5108                                 *error = ERANGE;
5109                                 bp = NULL;
5110                                 goto done;
5111                         }
5112                         *--bp = ncp->nc_name[i];
5113                 }
5114                 if (bp == buf) {
5115                         *error = ERANGE;
5116                         bp = NULL;
5117                         goto done;
5118                 }
5119                 *--bp = '/';
5120                 slash_prefixed = 1;
5121
5122                 /*
5123                  * Go up a directory.  This isn't a mount point so we don't
5124                  * have to check again.
5125                  */
5126                 while ((nch.ncp = ncp->nc_parent) != NULL) {
5127                         if (ncp_shared_lock_disable)
5128                                 _cache_lock(ncp);
5129                         else
5130                                 _cache_lock_shared(ncp);
5131                         if (nch.ncp != ncp->nc_parent) {
5132                                 _cache_unlock(ncp);
5133                                 continue;
5134                         }
5135                         _cache_hold(nch.ncp);
5136                         _cache_unlock(ncp);
5137                         break;
5138                 }
5139                 _cache_drop(ncp);
5140                 ncp = nch.ncp;
5141         }
5142         if (ncp == NULL) {
5143                 *error = ENOENT;
5144                 bp = NULL;
5145                 goto done;
5146         }
5147         if (!slash_prefixed) {
5148                 if (bp == buf) {
5149                         *error = ERANGE;
5150                         bp = NULL;
5151                         goto done;
5152                 }
5153                 *--bp = '/';
5154         }
5155         *error = 0;
5156 done:
5157         if (ncp)
5158                 _cache_drop(ncp);
5159         return (bp);
5160 }
5161
5162 /*
5163  * Thus begins the fullpath magic.
5164  *
5165  * The passed nchp is referenced but not locked.
5166  */
5167 __read_mostly static int disablefullpath;
5168 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
5169     &disablefullpath, 0,
5170     "Disable fullpath lookups");
5171
5172 int
5173 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
5174                char **retbuf, char **freebuf, int guess)
5175 {
5176         struct nchandle fd_nrdir;
5177         struct nchandle nch;
5178         struct namecache *ncp;
5179         struct mount *mp, *new_mp;
5180         char *bp, *buf;
5181         int slash_prefixed;
5182         int error = 0;
5183         int i;
5184
5185         *retbuf = NULL;
5186         *freebuf = NULL;
5187
5188         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
5189         bp = buf + MAXPATHLEN - 1;
5190         *bp = '\0';
5191         if (nchbase)
5192                 fd_nrdir = *nchbase;
5193         else if (p != NULL)
5194                 fd_nrdir = p->p_fd->fd_nrdir;
5195         else
5196                 fd_nrdir = rootnch;
5197         slash_prefixed = 0;
5198         nch = *nchp;
5199         ncp = nch.ncp;
5200         if (ncp)
5201                 _cache_hold(ncp);
5202         mp = nch.mount;
5203
5204         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
5205                 new_mp = NULL;
5206
5207                 /*
5208                  * If we are asked to guess the upwards path, we do so whenever
5209                  * we encounter an ncp marked as a mountpoint. We try to find
5210                  * the actual mountpoint by finding the mountpoint with this
5211                  * ncp.
5212                  */
5213                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
5214                         new_mp = mount_get_by_nc(ncp);
5215                 }
5216                 /*
5217                  * While traversing upwards if we encounter the root
5218                  * of the current mount we have to skip to the mount point.
5219                  */
5220                 if (ncp == mp->mnt_ncmountpt.ncp) {
5221                         new_mp = mp;
5222                 }
5223                 if (new_mp) {
5224                         nch = new_mp->mnt_ncmounton;
5225                         _cache_drop(ncp);
5226                         ncp = nch.ncp;
5227                         if (ncp)
5228                                 _cache_hold(ncp);
5229                         mp = nch.mount;
5230                         continue;
5231                 }
5232
5233                 /*
5234                  * Prepend the path segment
5235                  */
5236                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
5237                         if (bp == buf) {
5238                                 kfree(buf, M_TEMP);
5239                                 error = ENOMEM;
5240                                 goto done;
5241                         }
5242                         *--bp = ncp->nc_name[i];
5243                 }
5244                 if (bp == buf) {
5245                         kfree(buf, M_TEMP);
5246                         error = ENOMEM;
5247                         goto done;
5248                 }
5249                 *--bp = '/';
5250                 slash_prefixed = 1;
5251
5252                 /*
5253                  * Go up a directory.  This isn't a mount point so we don't
5254                  * have to check again.
5255                  *
5256                  * We can only safely access nc_parent with ncp held locked.
5257                  */
5258                 while ((nch.ncp = ncp->nc_parent) != NULL) {
5259                         _cache_lock_shared(ncp);
5260                         if (nch.ncp != ncp->nc_parent) {
5261                                 _cache_unlock(ncp);
5262                                 continue;
5263                         }
5264                         _cache_hold(nch.ncp);
5265                         _cache_unlock(ncp);
5266                         break;
5267                 }
5268                 _cache_drop(ncp);
5269                 ncp = nch.ncp;
5270         }
5271         if (ncp == NULL) {
5272                 kfree(buf, M_TEMP);
5273                 error = ENOENT;
5274                 goto done;
5275         }
5276
5277         if (!slash_prefixed) {
5278                 if (bp == buf) {
5279                         kfree(buf, M_TEMP);
5280                         error = ENOMEM;
5281                         goto done;
5282                 }
5283                 *--bp = '/';
5284         }
5285         *retbuf = bp;
5286         *freebuf = buf;
5287         error = 0;
5288 done:
5289         if (ncp)
5290                 _cache_drop(ncp);
5291         return(error);
5292 }
5293
5294 int
5295 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
5296             char **freebuf, int guess)
5297 {
5298         struct namecache *ncp;
5299         struct nchandle nch;
5300         int error;
5301
5302         *freebuf = NULL;
5303         if (disablefullpath)
5304                 return (ENODEV);
5305
5306         if (p == NULL)
5307                 return (EINVAL);
5308
5309         /* vn is NULL, client wants us to use p->p_textvp */
5310         if (vn == NULL) {
5311                 if ((vn = p->p_textvp) == NULL)
5312                         return (EINVAL);
5313         }
5314         spin_lock_shared(&vn->v_spin);
5315         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
5316                 if (ncp->nc_nlen)
5317                         break;
5318         }
5319         if (ncp == NULL) {
5320                 spin_unlock_shared(&vn->v_spin);
5321                 return (EINVAL);
5322         }
5323         _cache_hold(ncp);
5324         spin_unlock_shared(&vn->v_spin);
5325
5326         nch.ncp = ncp;
5327         nch.mount = vn->v_mount;
5328         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
5329         _cache_drop(ncp);
5330         return (error);
5331 }
5332
5333 void
5334 vfscache_rollup_cpu(struct globaldata *gd)
5335 {
5336         struct pcpu_ncache *pn;
5337         long count;
5338
5339         if (pcpu_ncache == NULL)
5340                 return;
5341         pn = &pcpu_ncache[gd->gd_cpuid];
5342
5343         /*
5344          * namecache statistics
5345          */
5346         if (pn->vfscache_count) {
5347                 count = atomic_swap_long(&pn->vfscache_count, 0);
5348                 atomic_add_long(&vfscache_count, count);
5349         }
5350         if (pn->vfscache_leafs) {
5351                 count = atomic_swap_long(&pn->vfscache_leafs, 0);
5352                 atomic_add_long(&vfscache_leafs, count);
5353         }
5354         if (pn->vfscache_unres) {
5355                 count = atomic_swap_long(&pn->vfscache_unres, 0);
5356                 atomic_add_long(&vfscache_unres, count);
5357         }
5358         if (pn->vfscache_negs) {
5359                 count = atomic_swap_long(&pn->vfscache_negs, 0);
5360                 atomic_add_long(&vfscache_negs, count);
5361         }
5362
5363         /*
5364          * hysteresis based cleanings
5365          */
5366         if (pn->inv_kid_quick_count) {
5367                 count = atomic_swap_long(&pn->inv_kid_quick_count, 0);
5368                 atomic_add_long(&inv_kid_quick_count, count);
5369         }
5370         if (pn->inv_ncp_quick_count) {
5371                 count = atomic_swap_long(&pn->inv_ncp_quick_count, 0);
5372                 atomic_add_long(&inv_ncp_quick_count, count);
5373         }
5374         if (pn->clean_pos_count) {
5375                 count = atomic_swap_long(&pn->clean_pos_count, 0);
5376                 atomic_add_long(&clean_pos_count, count);
5377         }
5378         if (pn->clean_neg_count) {
5379                 count = atomic_swap_long(&pn->clean_neg_count, 0);
5380                 atomic_add_long(&clean_neg_count, count);
5381         }
5382
5383         if (pn->numdefered) {
5384                 count = atomic_swap_long(&pn->numdefered, 0);
5385                 atomic_add_long(&numdefered, count);
5386         }
5387 }