sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  */
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/kernel.h>
  68 #include <sys/sysctl.h>
  69 #include <sys/mount.h>
  70 #include <sys/vnode.h>
  71 #include <sys/malloc.h>
  72 #include <sys/sysproto.h>
  73 #include <sys/spinlock.h>
  74 #include <sys/proc.h>
  75 #include <sys/namei.h>
  76 #include <sys/nlookup.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/fnv_hash.h>
  79 #include <sys/globaldata.h>
  80 #include <sys/kern_syscall.h>
  81 #include <sys/dirent.h>
  82 #include <ddb/ddb.h>
  83
  84 #include <sys/sysref2.h>
  85 #include <sys/spinlock2.h>
  86
  87 #define MAX_RECURSION_DEPTH     64
  88
  89 /*
  90  * Random lookups in the cache are accomplished with a hash table using
  91  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
  92  *
  93  * Negative entries may exist and correspond to resolved namecache
  94  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
  95  * will be set if the entry corresponds to a whited-out directory entry
   96  * (versus simply not finding the entry at all).   ncneglist is locked
  97  * with a global spinlock (ncspin).
  98  *
  99  * MPSAFE RULES:
 100  *
 101  * (1) A ncp must be referenced before it can be locked.
 102  *
 103  * (2) A ncp must be locked in order to modify it.
 104  *
 105  * (3) ncp locks are always ordered child -> parent.  That may seem
 106  *     backwards but forward scans use the hash table and thus can hold
 107  *     the parent unlocked when traversing downward.
 108  *
 109  *     This allows insert/rename/delete/dot-dot and other operations
 110  *     to use ncp->nc_parent links.
 111  *
 112  *     This also prevents a locked up e.g. NFS node from creating a
 113  *     chain reaction all the way back to the root vnode / namecache.
 114  *
 115  * (4) parent linkages require both the parent and child to be locked.
 116  */
 117
 118 /*
 119  * Structures associated with name cacheing.
 120  */
 121 #define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
 122 #define MINNEG                  1024
 123 #define MINPOS                  1024
 124 #define NCMOUNT_NUMCACHE        1009    /* prime number */
 125
 126 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 127
 128 LIST_HEAD(nchash_list, namecache);
 129
 130 /*
 131  * Don't cachealign, but at least pad to 32 bytes so entries
 132  * don't cross a cache line.
 133  */
 134 struct nchash_head {
 135        struct nchash_list list; /* 16 bytes */
 136        struct spinlock  spin;   /* 8 bytes */
 137        long     pad01;          /* 8 bytes */
 138 };
 139
 140 struct ncmount_cache {
 141         struct spinlock spin;
 142         struct namecache *ncp;
 143         struct mount *mp;
 144         int isneg;              /* if != 0 mp is originator and not target */
 145 };
 146
 147 static struct nchash_head       *nchashtbl;
 148 static struct namecache_list    ncneglist;
 149 static struct spinlock          ncspin;
 150 static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];
 151
 152 /*
 153  * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 154  * to create the namecache infrastructure leading to a dangling vnode.
 155  *
 156  * 0    Only errors are reported
 157  * 1    Successes are reported
 158  * 2    Successes + the whole directory scan is reported
 159  * 3    Force the directory scan code run as if the parent vnode did not
 160  *      have a namecache record, even if it does have one.
 161  */
 162 static int      ncvp_debug;
 163 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
 164     "Namecache debug level (0-3)");
 165
 166 static u_long   nchash;                 /* size of hash table */
 167 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 168     "Size of namecache hash table");
 169
 170 static int      ncnegflush = 10;        /* burst for negative flush */
 171 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
 172     "Batch flush negative entries");
 173
 174 static int      ncposflush = 10;        /* burst for positive flush */
 175 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
 176     "Batch flush positive entries");
 177
 178 static int      ncnegfactor = 16;       /* ratio of negative entries */
 179 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 180     "Ratio of namecache negative entries");
 181
 182 static int      nclockwarn;             /* warn on locked entries in ticks */
 183 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
 184     "Warn on locked namecache entries in ticks");
 185
 186 static int      numdefered;             /* number of cache entries allocated */
 187 SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
 188     "Number of cache entries allocated");
 189
 190 static int      ncposlimit;             /* number of cache entries allocated */
 191 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
 192     "Number of cache entries allocated");
 193
 194 static int      ncp_shared_lock_disable = 0;
 195 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
 196            &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
 197
 198 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
 199     "sizeof(struct vnode)");
 200 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
 201     "sizeof(struct namecache)");
 202
 203 static int      ncmount_cache_enable = 1;
 204 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
 205            &ncmount_cache_enable, 0, "mount point cache");
 206 static long     ncmount_cache_hit;
 207 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_hit, CTLFLAG_RW,
 208             &ncmount_cache_hit, 0, "mpcache hits");
 209 static long     ncmount_cache_miss;
 210 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_miss, CTLFLAG_RW,
 211             &ncmount_cache_miss, 0, "mpcache misses");
 212 static long     ncmount_cache_overwrite;
 213 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW,
 214             &ncmount_cache_overwrite, 0, "mpcache entry overwrites");
 215
 216 static __inline void _cache_drop(struct namecache *ncp);
 217 static int cache_resolve_mp(struct mount *mp);
 218 static struct vnode *cache_dvpref(struct namecache *ncp);
 219 static void _cache_lock(struct namecache *ncp);
 220 static void _cache_setunresolved(struct namecache *ncp);
 221 static void _cache_cleanneg(int count);
 222 static void _cache_cleanpos(int count);
 223 static void _cache_cleandefered(void);
 224 static void _cache_unlink(struct namecache *ncp);
 225
 226 /*
 227  * The new name cache statistics
 228  */
 229 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
 230 static int numneg;
 231 SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
 232     "Number of negative namecache entries");
 233 static int numcache;
 234 SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
 235     "Number of namecaches entries");
 236
 237 struct nchstats nchstats[SMP_MAXCPU];
 238 /*
 239  * Export VFS cache effectiveness statistics to user-land.
 240  *
 241  * The statistics are left for aggregation to user-land so
 242  * neat things can be achieved, like observing per-CPU cache
 243  * distribution.
 244  */
 245 static int
 246 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 247 {
 248         struct globaldata *gd;
 249         int i, error;
 250
 251         error = 0;
 252         for (i = 0; i < ncpus; ++i) {
 253                 gd = globaldata_find(i);
 254                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
 255                         sizeof(struct nchstats))))
 256                         break;
 257         }
 258
 259         return (error);
 260 }
 261 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
 262   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 263
 264 static struct namecache *cache_zap(struct namecache *ncp, int nonblock);
 265
 266 /*
 267  * Cache mount points and namecache records in order to avoid unnecessary
 268  * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 269  * performance and is particularly important on multi-socket systems to
 270  * reduce cache-line ping-ponging.
 271  *
 272  * Try to keep the pcpu structure within one cache line (~64 bytes).
 273  */
 274 #define MNTCACHE_COUNT      5
 275
 276 struct mntcache {
 277         struct mount    *mntary[MNTCACHE_COUNT];
 278         struct namecache *ncp1;
 279         struct namecache *ncp2;
 280         struct nchandle  ncdir;
 281         int             iter;
 282         int             unused01;
 283 } __cachealign;
 284
 285 static struct mntcache  pcpu_mntcache[MAXCPU];
 286
 287 static
 288 void
 289 _cache_mntref(struct mount *mp)
 290 {
 291         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
 292         int i;
 293
 294         for (i = 0; i < MNTCACHE_COUNT; ++i) {
 295                 if (cache->mntary[i] != mp)
 296                         continue;
 297                 if (atomic_cmpset_ptr((void *)&cache->mntary[i], mp, NULL))
 298                         return;
 299         }
 300         atomic_add_int(&mp->mnt_refs, 1);
 301 }
 302
 303 static
 304 void
 305 _cache_mntrel(struct mount *mp)
 306 {
 307         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
 308         int i;
 309
 310         for (i = 0; i < MNTCACHE_COUNT; ++i) {
 311                 if (cache->mntary[i] == NULL) {
 312                         mp = atomic_swap_ptr((void *)&cache->mntary[i], mp);
 313                         if (mp == NULL)
 314                                 return;
 315                 }
 316         }
 317         i = (int)((uint32_t)++cache->iter % (uint32_t)MNTCACHE_COUNT);
 318         mp = atomic_swap_ptr((void *)&cache->mntary[i], mp);
 319         if (mp)
 320                 atomic_add_int(&mp->mnt_refs, -1);
 321 }
 322
 323 /*
 324  * Clears all cached mount points on all cpus.  This routine should only
 325  * be called when we are waiting for a mount to clear, e.g. so we can
 326  * unmount.
 327  */
 328 void
 329 cache_clearmntcache(void)
 330 {
 331         int n;
 332
 333         for (n = 0; n < ncpus; ++n) {
 334                 struct mntcache *cache = &pcpu_mntcache[n];
 335                 struct namecache *ncp;
 336                 struct mount *mp;
 337                 int i;
 338
 339                 for (i = 0; i < MNTCACHE_COUNT; ++i) {
 340                         if (cache->mntary[i]) {
 341                                 mp = atomic_swap_ptr(
 342                                         (void *)&cache->mntary[i], NULL);
 343                                 if (mp)
 344                                         atomic_add_int(&mp->mnt_refs, -1);
 345                         }
 346                 }
 347                 if (cache->ncp1) {
 348                         ncp = atomic_swap_ptr((void *)&cache->ncp1, NULL);
 349                         if (ncp)
 350                                 _cache_drop(ncp);
 351                 }
 352                 if (cache->ncp2) {
 353                         ncp = atomic_swap_ptr((void *)&cache->ncp2, NULL);
 354                         if (ncp)
 355                                 _cache_drop(ncp);
 356                 }
 357                 if (cache->ncdir.ncp) {
 358                         ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, NULL);
 359                         if (ncp)
 360                                 _cache_drop(ncp);
 361                 }
 362                 if (cache->ncdir.mount) {
 363                         mp = atomic_swap_ptr((void *)&cache->ncdir.mount, NULL);
 364                         if (mp)
 365                                 atomic_add_int(&mp->mnt_refs, -1);
 366                 }
 367         }
 368 }
 369
 370
 371 /*
 372  * Namespace locking.  The caller must already hold a reference to the
 373  * namecache structure in order to lock/unlock it.  This function prevents
 374  * the namespace from being created or destroyed by accessors other then
 375  * the lock holder.
 376  *
 377  * Note that holding a locked namecache structure prevents other threads
 378  * from making namespace changes (e.g. deleting or creating), prevents
 379  * vnode association state changes by other threads, and prevents the
 380  * namecache entry from being resolved or unresolved by other threads.
 381  *
 382  * An exclusive lock owner has full authority to associate/disassociate
 383  * vnodes and resolve/unresolve the locked ncp.
 384  *
 385  * A shared lock owner only has authority to acquire the underlying vnode,
 386  * if any.
 387  *
 388  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 389  * fact (when locking) or cleared prior to unlocking.
 390  *
 391  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 392  *           or recycled, but it does NOT help you if the vnode had already
 393  *           initiated a recyclement.  If this is important, use cache_get()
 394  *           rather then cache_lock() (and deal with the differences in the
 395  *           way the refs counter is handled).  Or, alternatively, make an
 396  *           unconditional call to cache_validate() or cache_resolve()
 397  *           after cache_lock() returns.
 398  */
 399 static
 400 void
 401 _cache_lock(struct namecache *ncp)
 402 {
 403         thread_t td;
 404         int didwarn;
 405         int begticks;
 406         int error;
 407         u_int count;
 408
 409         KKASSERT(ncp->nc_refs != 0);
 410         didwarn = 0;
 411         begticks = 0;
 412         td = curthread;
 413
 414         for (;;) {
 415                 count = ncp->nc_lockstatus;
 416                 cpu_ccfence();
 417
 418                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
 419                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 420                                               count, count + 1)) {
 421                                 /*
 422                                  * The vp associated with a locked ncp must
 423                                  * be held to prevent it from being recycled.
 424                                  *
 425                                  * WARNING!  If VRECLAIMED is set the vnode
 426                                  * could already be in the middle of a recycle.
 427                                  * Callers must use cache_vref() or
 428                                  * cache_vget() on the locked ncp to
 429                                  * validate the vp or set the cache entry
 430                                  * to unresolved.
 431                                  *
 432                                  * NOTE! vhold() is allowed if we hold a
 433                                  *       lock on the ncp (which we do).
 434                                  */
 435                                 ncp->nc_locktd = td;
 436                                 if (ncp->nc_vp)
 437                                         vhold(ncp->nc_vp);
 438                                 break;
 439                         }
 440                         /* cmpset failed */
 441                         continue;
 442                 }
 443                 if (ncp->nc_locktd == td) {
 444                         KKASSERT((count & NC_SHLOCK_FLAG) == 0);
 445                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 446                                               count, count + 1)) {
 447                                 break;
 448                         }
 449                         /* cmpset failed */
 450                         continue;
 451                 }
 452                 tsleep_interlock(&ncp->nc_locktd, 0);
 453                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
 454                                       count | NC_EXLOCK_REQ) == 0) {
 455                         /* cmpset failed */
 456                         continue;
 457                 }
 458                 if (begticks == 0)
 459                         begticks = ticks;
 460                 error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
 461                                "clock", nclockwarn);
 462                 if (error == EWOULDBLOCK) {
 463                         if (didwarn == 0) {
 464                                 didwarn = ticks;
 465                                 kprintf("[diagnostic] cache_lock: "
 466                                         "%s blocked on %p %08x",
 467                                         td->td_comm, ncp, count);
 468                                 kprintf(" \"%*.*s\"\n",
 469                                         ncp->nc_nlen, ncp->nc_nlen,
 470                                         ncp->nc_name);
 471                         }
 472                 }
 473                 /* loop */
 474         }
 475         if (didwarn) {
 476                 kprintf("[diagnostic] cache_lock: %s unblocked %*.*s after "
 477                         "%d secs\n",
 478                         td->td_comm,
 479                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 480                         (int)(ticks + (hz / 2) - begticks) / hz);
 481         }
 482 }
 483
 484 /*
 485  * The shared lock works similarly to the exclusive lock except
 486  * nc_locktd is left NULL and we need an interlock (VHOLD) to
 487  * prevent vhold() races, since the moment our cmpset_int succeeds
 488  * another cpu can come in and get its own shared lock.
 489  *
 490  * A critical section is needed to prevent interruption during the
 491  * VHOLD interlock.
 492  */
 493 static
 494 void
 495 _cache_lock_shared(struct namecache *ncp)
 496 {
 497         int didwarn;
 498         int error;
 499         u_int count;
 500         u_int optreq = NC_EXLOCK_REQ;
 501
 502         KKASSERT(ncp->nc_refs != 0);
 503         didwarn = 0;
 504
 505         for (;;) {
 506                 count = ncp->nc_lockstatus;
 507                 cpu_ccfence();
 508
 509                 if ((count & ~NC_SHLOCK_REQ) == 0) {
 510                         crit_enter();
 511                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 512                                       count,
 513                                       (count + 1) | NC_SHLOCK_FLAG |
 514                                                     NC_SHLOCK_VHOLD)) {
 515                                 /*
 516                                  * The vp associated with a locked ncp must
 517                                  * be held to prevent it from being recycled.
 518                                  *
 519                                  * WARNING!  If VRECLAIMED is set the vnode
 520                                  * could already be in the middle of a recycle.
 521                                  * Callers must use cache_vref() or
 522                                  * cache_vget() on the locked ncp to
 523                                  * validate the vp or set the cache entry
 524                                  * to unresolved.
 525                                  *
 526                                  * NOTE! vhold() is allowed if we hold a
 527                                  *       lock on the ncp (which we do).
 528                                  */
 529                                 if (ncp->nc_vp)
 530                                         vhold(ncp->nc_vp);
 531                                 atomic_clear_int(&ncp->nc_lockstatus,
 532                                                  NC_SHLOCK_VHOLD);
 533                                 crit_exit();
 534                                 break;
 535                         }
 536                         /* cmpset failed */
 537                         crit_exit();
 538                         continue;
 539                 }
 540
 541                 /*
 542                  * If already held shared we can just bump the count, but
 543                  * only allow this if nobody is trying to get the lock
 544                  * exclusively.  If we are blocking too long ignore excl
 545                  * requests (which can race/deadlock us).
 546                  *
 547                  * VHOLD is a bit of a hack.  Even though we successfully
 548                  * added another shared ref, the cpu that got the first
 549                  * shared ref might not yet have held the vnode.
 550                  */
 551                 if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) {
 552                         KKASSERT((count & ~(NC_EXLOCK_REQ |
 553                                             NC_SHLOCK_REQ |
 554                                             NC_SHLOCK_FLAG)) > 0);
 555                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 556                                               count, count + 1)) {
 557                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
 558                                         cpu_pause();
 559                                 break;
 560                         }
 561                         continue;
 562                 }
 563                 tsleep_interlock(ncp, 0);
 564                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
 565                                       count | NC_SHLOCK_REQ) == 0) {
 566                         /* cmpset failed */
 567                         continue;
 568                 }
 569                 error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
 570                 if (error == EWOULDBLOCK) {
 571                         optreq = 0;
 572                         if (didwarn == 0) {
 573                                 didwarn = ticks - nclockwarn;
 574                                 kprintf("[diagnostic] cache_lock_shared: "
 575                                         "%s blocked on %p %08x",
 576                                         curthread->td_comm, ncp, count);
 577                                 kprintf(" \"%*.*s\"\n",
 578                                         ncp->nc_nlen, ncp->nc_nlen,
 579                                         ncp->nc_name);
 580                         }
 581                 }
 582                 /* loop */
 583         }
 584         if (didwarn) {
 585                 kprintf("[diagnostic] cache_lock_shared: "
 586                         "%s unblocked %*.*s after %d secs\n",
 587                         curthread->td_comm,
 588                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 589                         (int)(ticks - didwarn) / hz);
 590         }
 591 }
 592
 593 /*
 594  * Lock ncp exclusively, return 0 on success.
 595  *
 596  * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
 597  *       such as the case where one of its children is locked.
 598  */
 599 static
 600 int
 601 _cache_lock_nonblock(struct namecache *ncp)
 602 {
 603         thread_t td;
 604         u_int count;
 605
 606         td = curthread;
 607
 608         for (;;) {
 609                 count = ncp->nc_lockstatus;
 610
 611                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
 612                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 613                                               count, count + 1)) {
 614                                 /*
 615                                  * The vp associated with a locked ncp must
 616                                  * be held to prevent it from being recycled.
 617                                  *
 618                                  * WARNING!  If VRECLAIMED is set the vnode
 619                                  * could already be in the middle of a recycle.
 620                                  * Callers must use cache_vref() or
 621                                  * cache_vget() on the locked ncp to
 622                                  * validate the vp or set the cache entry
 623                                  * to unresolved.
 624                                  *
 625                                  * NOTE! vhold() is allowed if we hold a
 626                                  *       lock on the ncp (which we do).
 627                                  */
 628                                 ncp->nc_locktd = td;
 629                                 if (ncp->nc_vp)
 630                                         vhold(ncp->nc_vp);
 631                                 break;
 632                         }
 633                         /* cmpset failed */
 634                         continue;
 635                 }
 636                 if (ncp->nc_locktd == td) {
 637                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 638                                               count, count + 1)) {
 639                                 break;
 640                         }
 641                         /* cmpset failed */
 642                         continue;
 643                 }
 644                 return(EWOULDBLOCK);
 645         }
 646         return(0);
 647 }
 648
 649 /*
 650  * The shared lock works similarly to the exclusive lock except
 651  * nc_locktd is left NULL and we need an interlock (VHOLD) to
 652  * prevent vhold() races, since the moment our cmpset_int succeeds
 653  * another cpu can come in and get its own shared lock.
 654  *
 655  * A critical section is needed to prevent interruption during the
 656  * VHOLD interlock.
 657  */
 658 static
 659 int
 660 _cache_lock_shared_nonblock(struct namecache *ncp)
 661 {
 662         u_int count;
 663
 664         for (;;) {
 665                 count = ncp->nc_lockstatus;
 666
 667                 if ((count & ~NC_SHLOCK_REQ) == 0) {
 668                         crit_enter();
 669                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 670                                       count,
 671                                       (count + 1) | NC_SHLOCK_FLAG |
 672                                                     NC_SHLOCK_VHOLD)) {
 673                                 /*
 674                                  * The vp associated with a locked ncp must
 675                                  * be held to prevent it from being recycled.
 676                                  *
 677                                  * WARNING!  If VRECLAIMED is set the vnode
 678                                  * could already be in the middle of a recycle.
 679                                  * Callers must use cache_vref() or
 680                                  * cache_vget() on the locked ncp to
 681                                  * validate the vp or set the cache entry
 682                                  * to unresolved.
 683                                  *
 684                                  * NOTE! vhold() is allowed if we hold a
 685                                  *       lock on the ncp (which we do).
 686                                  */
 687                                 if (ncp->nc_vp)
 688                                         vhold(ncp->nc_vp);
 689                                 atomic_clear_int(&ncp->nc_lockstatus,
 690                                                  NC_SHLOCK_VHOLD);
 691                                 crit_exit();
 692                                 break;
 693                         }
 694                         /* cmpset failed */
 695                         crit_exit();
 696                         continue;
 697                 }
 698
 699                 /*
 700                  * If already held shared we can just bump the count, but
 701                  * only allow this if nobody is trying to get the lock
 702                  * exclusively.
 703                  *
 704                  * VHOLD is a bit of a hack.  Even though we successfully
 705                  * added another shared ref, the cpu that got the first
 706                  * shared ref might not yet have held the vnode.
 707                  */
 708                 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
 709                     NC_SHLOCK_FLAG) {
 710                         KKASSERT((count & ~(NC_EXLOCK_REQ |
 711                                             NC_SHLOCK_REQ |
 712                                             NC_SHLOCK_FLAG)) > 0);
 713                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 714                                               count, count + 1)) {
 715                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
 716                                         cpu_pause();
 717                                 break;
 718                         }
 719                         continue;
 720                 }
 721                 return(EWOULDBLOCK);
 722         }
 723         return(0);
 724 }
 725
 726 /*
 727  * Release one count of the lock held on ncp.  On the last release the
 728  * lock count is cleared, waiters are woken and any vnode in nc_vp is vdrop()'d.
 729  *
 730  * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
 731  *       nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
 732  */
 733 static
 734 void
 735 _cache_unlock(struct namecache *ncp)
 736 {
 737         thread_t td __debugvar = curthread;
 738         u_int count;
 739         u_int ncount;
 740         struct vnode *dropvp;
 741
 742         KKASSERT(ncp->nc_refs >= 0);
 743         KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
 744         KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);
 745
 746         count = ncp->nc_lockstatus;     /* snapshot used for the cmpset below */
 747         cpu_ccfence();
 748
 749         /*
 750          * Clear nc_locktd prior to the atomic op (excl lock only)
 751          */
 752         if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
 753                 ncp->nc_locktd = NULL;
 754         dropvp = NULL;
 755         /* Retry until one cmpset lands: final release or plain decrement. */
 756         for (;;) {
 757                 if ((count &
 758                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
 759                         dropvp = ncp->nc_vp;    /* last count: vp hold goes too */
 760                         if (count & NC_EXLOCK_REQ)
 761                                 ncount = count & NC_SHLOCK_REQ; /* cnt->0 */
 762                         else
 763                                 ncount = 0;
 764                         /* Install the released value, then wake the waiters. */
 765                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 766                                               count, ncount)) {
 767                                 if (count & NC_EXLOCK_REQ)
 768                                         wakeup(&ncp->nc_locktd);
 769                                 else if (count & NC_SHLOCK_REQ)
 770                                         wakeup(ncp);
 771                                 break;
 772                         }
 773                         dropvp = NULL;          /* cmpset failed, start over */
 774                 } else {
 775                         KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
 776                         KKASSERT((count & ~(NC_EXLOCK_REQ |
 777                                             NC_SHLOCK_REQ |
 778                                             NC_SHLOCK_FLAG)) > 1);
 779                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 780                                               count, count - 1)) {
 781                                 break;
 782                         }
 783                 }
 784                 count = ncp->nc_lockstatus;     /* reload and retry */
 785                 cpu_ccfence();
 786         }
 787
 788         /*
 789          * Don't actually drop the vp until we successfully clean out
 790          * the lock, otherwise we may race another shared lock.
 791          */
 792         if (dropvp)
 793                 vdrop(dropvp);
 794 }
 795
 796 static
 797 int
 798 _cache_lockstatus(struct namecache *ncp)
 799 {
 800         /* LK_EXCLUSIVE if nc_locktd is curthread, else LK_SHARED or -1. */
 801         if (ncp->nc_locktd == curthread)
 802                 return LK_EXCLUSIVE;
 803
 804         return (ncp->nc_lockstatus & NC_SHLOCK_FLAG) ? LK_SHARED : -1;
 805 }
 806
 807 /*
 808  * Add one reference to ncp and hand the same pointer back.
 809  *
 810  * cache_hold() and cache_drop() only keep a namecache entry from being
 811  * deleted prematurely; operations such as zapping can still occur on it.
 812  *
 813  * Code outside this source module may only call this if nc_refs is
 814  * already at least 1.
 815  *
 816  * Callers are allowed to hold a spinlock here, so we must not take one.
 817  */
 818 static __inline
 819 struct namecache *
 820 _cache_hold(struct namecache *ncp)
 821 {
 822         atomic_add_int(&ncp->nc_refs, 1);
 823         return ncp;
 824 }
 825
 826 /*
 827  * Drop a cache entry, taking care to deal with races.
 828  *
 829  * For potential 1->0 transitions we must hold the ncp lock to safely
 830  * test its flags.  An unresolved entry with no children must be zapped
 831  * to avoid leaks.
 832  *
 833  * The call to cache_zap() itself will handle all remaining races and
 834  * will decrement the ncp's refs regardless.  If we are resolved or
 835  * have children nc_refs can safely be dropped to 0 without having to
 836  * zap the entry.
 837  *
 838  * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 839  *
 840  * NOTE: cache_zap() may return a non-NULL referenced parent which must
 841  *       be dropped in a loop.
 842  */
 843 static __inline
 844 void
 845 _cache_drop(struct namecache *ncp)
 846 {
 847         int refs;
 848         /* Loop until a drop lands; cache_zap() may hand back a parent to drop. */
 849         while (ncp) {
 850                 KKASSERT(ncp->nc_refs > 0);
 851                 refs = ncp->nc_refs;
 852
 853                 if (refs == 1) {        /* possible 1->0: needs the ncp lock */
 854                         if (_cache_lock_nonblock(ncp) == 0) {
 855                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
 856                                 if ((ncp->nc_flag & NCF_UNRESOLVED) &&
 857                                     TAILQ_EMPTY(&ncp->nc_list)) {
 858                                         ncp = cache_zap(ncp, 1);
 859                                         continue;       /* next: returned parent, if any */
 860                                 }
 861                                 if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
 862                                         _cache_unlock(ncp);
 863                                         break;
 864                                 }
 865                                 _cache_unlock(ncp);     /* refs changed under us */
 866                         }
 867                 } else {
 868                         if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
 869                                 break;
 870                 }
 871                 cpu_pause();            /* lock busy or cmpset lost: retry */
 872         }
 873 }
 874
 875 /*
 876  * Attach ncp beneath par and enter it on the nchpp hash chain.  Be
 877  * careful to avoid races if vhold() blocks in the future.
 878  *
 879  * Both ncp and par must be referenced and locked.
 880  *
 881  * NOTE: The hash table spinlock is held during this call, we can't do
 882  *       anything fancy.
 883  */
 884 static void
 885 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 886                    struct nchash_head *nchpp)
 887 {
 888         int first_child;
 889
 890         KKASSERT(ncp->nc_parent == NULL);
 891         ncp->nc_parent = par;
 892         ncp->nc_head = nchpp;
 893
 894         /*
 895          * Recompute the inherited flags from the parent.  The parent's
 896          * flags may be stale because getattr (run during nlookup()'s)
 897          * might not have been run yet.
 898          */
 899         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 900         if ((par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) != 0)
 901                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 902         if ((par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE)) != 0)
 903                 ncp->nc_flag |= NCF_UF_PCACHE;
 904
 905         LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 906
 907         /*
 908          * Any vp associated with an ncp which has children must be held
 909          * to prevent it from being recycled, so take that hold when ncp
 910          * becomes par's first child.
 911          */
 912         first_child = TAILQ_EMPTY(&par->nc_list);
 913         TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 914         if (first_child && par->nc_vp)
 915                 vhold(par->nc_vp);
 916 }
 917
 918 /*
 919  * Remove the parent and hash associations from a namecache structure.
 920  * If this is the last child of the parent the cache_drop(par) will
 921  * attempt to recursively zap the parent.
 922  *
 923  * ncp must be locked.  This routine will acquire a temporary lock on
 924  * the parent as well as the appropriate hash chain.
 925  */
 926 static void
 927 _cache_unlink_parent(struct namecache *ncp)
 928 {
 929         struct namecache *par;
 930         struct vnode *dropvp;
 931
 932         if ((par = ncp->nc_parent) != NULL) {
 933                 KKASSERT(ncp->nc_parent == par);
 934                 _cache_hold(par);       /* ref par for the duration */
 935                 _cache_lock(par);
 936                 spin_lock(&ncp->nc_head->spin);
 937                 LIST_REMOVE(ncp, nc_hash);
 938                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
 939                 dropvp = NULL;
 940                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
 941                         dropvp = par->nc_vp;    /* par has no children left */
 942                 spin_unlock(&ncp->nc_head->spin);
 943                 ncp->nc_parent = NULL;
 944                 ncp->nc_head = NULL;
 945                 _cache_unlock(par);
 946                 _cache_drop(par);       /* undo the _cache_hold() above */
 947
 948                 /*
 949                  * We can only safely vdrop with no spinlocks held.
 950                  */
 951                 if (dropvp)
 952                         vdrop(dropvp);
 953         }
 954 }
 955
 956 /*
 957  * Create a new unresolved namecache entry, returned referenced and locked.
 958  * The spare name byte allows zero-termination (eases vop_compat_ncreate()).
 959  */
 960 static struct namecache *
 961 cache_alloc(int nlen)
 962 {
 963         struct namecache *ncp;
 964
 965         ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK | M_ZERO);
 966         ncp->nc_refs = 1;
 967         ncp->nc_nlen = nlen;
 968         ncp->nc_flag = NCF_UNRESOLVED;
 969         ncp->nc_error = ENOTCONN;       /* not resolved yet */
 970         if (nlen != 0)
 971                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
 972
 973         TAILQ_INIT(&ncp->nc_list);
 974         _cache_lock(ncp);
 975         return ncp;
 976 }
 977
 978 /*
 979  * Free an ncp that has never been associated with anything, which is
 980  * the only case this may be called for (so no spinlocks are needed).
 981  */
 982 static void
 983 _cache_free(struct namecache *ncp)
 984 {
 985         KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1);
 986         if (ncp->nc_name != NULL)
 987                 kfree(ncp->nc_name, M_VFSCACHE);
 988         kfree(ncp, M_VFSCACHE);
 989 }
 990
 991 /*
 992  * Reset a nchandle so that it refers to nothing (no ncp, no mount).
 993  */
 994 void
 995 cache_zero(struct nchandle *nch)
 996 {
 997         nch->mount = NULL;
 998         nch->ncp = NULL;
 999 }
1000
1001 /*
1002  * Add a reference to both halves of a nchandle (ncp and mount) and
1003  * return the handle that was passed in.
1004  *
1005  * The caller must supply a stable ncp pointer.  Typically the ncp is
1006  * already referenced, but stability can also come indirectly, e.g.
1007  * from holding a lock on a direct child.
1008  *
1009  * WARNING: Caller may hold an unrelated read spinlock, so no read spinlocks.
1010  */
1011 struct nchandle *
1012 cache_hold(struct nchandle *nch)
1013 {
1014         _cache_hold(nch->ncp);
1015         _cache_mntref(nch->mount);
1016         return nch;
1017 }
1018
1019 /*
1020  * Create a copy of a namecache handle for an already-referenced
1021  * entry.  target receives its own mount ref and its own ncp ref.
1022  */
1023 void
1024 cache_copy(struct nchandle *nch, struct nchandle *target)
1025 {
1026         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1027         struct namecache *ncp;
1028         /* Copy both fields, then obtain a ref for each of them. */
1029         *target = *nch;
1030         _cache_mntref(target->mount);
1031         ncp = target->ncp;
1032         if (ncp) {
1033                 if (ncp == cache->ncp1) {       /* take over the ref parked in slot 1 */
1034                         if (atomic_cmpset_ptr((void *)&cache->ncp1, ncp, NULL))
1035                                 return;
1036                 }
1037                 if (ncp == cache->ncp2) {       /* same for slot 2 */
1038                         if (atomic_cmpset_ptr((void *)&cache->ncp2, ncp, NULL))
1039                                 return;
1040                 }
1041                 _cache_hold(ncp);       /* no parked ref claimed: add a new one */
1042         }
1043 }
1044
1045 /*
1046  * Caller wants to copy the current directory, copy it out from our
1047  * pcpu cache if possible (the entire critical path is just two localized
1048  * cmpset ops).  If the pcpu cache has a snapshot at all it will be a
1049  * valid one, so we don't have to lock p->p_fd even though we are loading
1050  * two fields.
1051  *
1052  * This has a limited effect since nlookup must still ref and shlock the
1053  * vnode to check perms.  We do avoid the per-proc spin-lock though, which
1054  * can aid threaded programs.
1055  */
1056 void
1057 cache_copy_ncdir(struct proc *p, struct nchandle *target)
1058 {
1059         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1060         /* Unlocked load; kept as-is only if both pcpu cmpsets below succeed. */
1061         *target = p->p_fd->fd_ncdir;
1062         if (target->ncp == cache->ncdir.ncp &&
1063             target->mount == cache->ncdir.mount) {
1064                 if (atomic_cmpset_ptr((void *)&cache->ncdir.ncp,
1065                                       target->ncp, NULL)) {
1066                         if (atomic_cmpset_ptr((void *)&cache->ncdir.mount,
1067                                               target->mount, NULL)) {
1068                                 /* CRITICAL PATH */
1069                                 return;
1070                         }
1071                         _cache_drop(target->ncp);       /* got the ncp but not the mount */
1072                 }
1073         }
1074         spin_lock_shared(&p->p_fd->fd_spin);    /* slow path: copy under fd_spin */
1075         cache_copy(&p->p_fd->fd_ncdir, target);
1076         spin_unlock_shared(&p->p_fd->fd_spin);
1077 }
1078 /* Point nch at mount mp, moving the handle's mount reference over to it. */
1079 void
1080 cache_changemount(struct nchandle *nch, struct mount *mp)
1081 {
1082         _cache_mntref(mp);              /* ref the new mount first */
1083         _cache_mntrel(nch->mount);      /* then release the old one */
1084         nch->mount = mp;
1085 }
1086
1087 void
1088 cache_drop(struct nchandle *nch)
1089 {
1090         _cache_mntrel(nch->mount);
1091         _cache_drop(nch->ncp);
1092         /* Both refs are gone; clear the handle's ncp and mount fields. */
1093         cache_zero(nch);
1094 }
1095
1096 /*
1097  * Drop the nchandle, but try to cache the ref to avoid global atomic
1098  * ops.  This is typically done on the system root and jail root nchandles.
1099  */
1100 void
1101 cache_drop_and_cache(struct nchandle *nch)
1102 {
1103         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1104         struct namecache *ncp;
1105         /* The mount ref is released here; only the ncp ref may be parked. */
1106         _cache_mntrel(nch->mount);
1107         ncp = nch->ncp;
1108         if (cache->ncp1 == NULL) {      /* slot 1 looks free, swap ours in */
1109                 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
1110                 if (ncp == NULL)
1111                         goto done;
1112         }
1113         if (cache->ncp2 == NULL) {      /* slot 2 looks free, swap ours in */
1114                 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
1115                 if (ncp == NULL)
1116                         goto done;
1117         }
1118         if (++cache->iter & 1)          /* both in use: alternate the slot replaced */
1119                 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
1120         else
1121                 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
1122         if (ncp)                        /* whatever got displaced is dropped for real */
1123                 _cache_drop(ncp);
1124 done:
1125         nch->ncp = NULL;
1126         nch->mount = NULL;
1127 }
1128
1129 /*
1130  * We are dropping what the caller believes is the current directory,
1131  * unconditionally store it in our pcpu cache.  Anything already in
1132  * the cache will be discarded.  nch is cleared on return.
1133  */
1134 void
1135 cache_drop_ncdir(struct nchandle *nch)
1136 {
1137         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1138         /* After the swaps nch holds the previous cache contents (may be NULL). */
1139         nch->ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, nch->ncp);
1140         nch->mount = atomic_swap_ptr((void *)&cache->ncdir.mount, nch->mount);
1141         if (nch->ncp)
1142                 _cache_drop(nch->ncp);
1143         if (nch->mount)
1144                 _cache_mntrel(nch->mount);
1145         nch->ncp = NULL;
1146         nch->mount = NULL;
1147 }
1148 /* Lock state of the handle's ncp, exactly as _cache_lockstatus() reports it. */
1149 int
1150 cache_lockstatus(struct nchandle *nch)
1151 {
1152         return _cache_lockstatus(nch->ncp);
1153 }
1154 /* Lock the handle's ncp via _cache_lock() (the exclusive-lock path). */
1155 void
1156 cache_lock(struct nchandle *nch)
1157 {
1158         _cache_lock(nch->ncp);
1159 }
1160
1161 void
1162 cache_lock_maybe_shared(struct nchandle *nch, int excl)
1163 {
1164         struct namecache *ncp = nch->ncp;
1165
1166         /* Exclusive requested, shared locks disabled, or ncp unresolved. */
1167         if (ncp_shared_lock_disable || excl ||
1168             (ncp->nc_flag & NCF_UNRESOLVED)) {
1169                 _cache_lock(ncp);
1170                 return;
1171         }
1172
1173         /* Try shared; fall back to exclusive if unresolved or reclaimed. */
1174         _cache_lock_shared(ncp);
1175         if ((ncp->nc_flag & NCF_UNRESOLVED) ||
1176             (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))) {
1177                 /* Not atomic: the shared lock is released first. */
1178                 _cache_unlock(ncp);
1179                 _cache_lock(ncp);
1180         }
1181 }
1182
1183 /*
1184  * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
1185  * is responsible for checking both for validity on return as they
1186  * may have become invalid.  Each handle is passed to cache_resolve()
1187  * with its matching cred right after it is locked here.
1188  * We have to deal with potential deadlocks here, just ping pong
1189  * the lock until we get it (we will always block somewhere when
1190  * looping so this is not cpu-intensive).
1191  *
1192  * which = 0    nch1 not locked, nch2 is locked
1193  * which = 1    nch1 is locked, nch2 is not locked
1194  */
1195 void
1196 cache_relock(struct nchandle *nch1, struct ucred *cred1,
1197              struct nchandle *nch2, struct ucred *cred2)
1198 {
1199         int which;
1200
1201         which = 0;
1202
1203         for (;;) {
1204                 if (which == 0) {
1205                         if (cache_lock_nonblock(nch1) == 0) {
1206                                 cache_resolve(nch1, cred1);
1207                                 break;          /* both are locked now */
1208                         }
1209                         cache_unlock(nch2);     /* back off, then block on nch1 */
1210                         cache_lock(nch1);
1211                         cache_resolve(nch1, cred1);
1212                         which = 1;
1213                 } else {
1214                         if (cache_lock_nonblock(nch2) == 0) {
1215                                 cache_resolve(nch2, cred2);
1216                                 break;          /* both are locked now */
1217                         }
1218                         cache_unlock(nch1);     /* back off, then block on nch2 */
1219                         cache_lock(nch2);
1220                         cache_resolve(nch2, cred2);
1221                         which = 0;
1222                 }
1223         }
1224 }
1225 /* Non-blocking lock of the handle's ncp; 0 on success (see _cache_lock_nonblock()). */
1226 int
1227 cache_lock_nonblock(struct nchandle *nch)
1228 {
1229         return _cache_lock_nonblock(nch->ncp);
1230 }
1231 /* Release one lock count on the handle's ncp via _cache_unlock(). */
1232 void
1233 cache_unlock(struct nchandle *nch)
1234 {
1235         _cache_unlock(nch->ncp);
1236 }
1237
1238 /*
1239  * Reference ncp and lock it with _cache_lock(), returning ncp.  This is
1240  * the ref-and-lock half of the ref-and-lock, unlock-and-deref functions
1241  * and is primarily used by nlookup.
1242  *
1243  * Even though the lock holds the vnode, the vnode may already have
1244  * started to be recycled.  If VRECLAIMED is set the ncp is set back to
1245  * unresolved, so that cache_get() returns a definitively usable vnode
1246  * or a definitively unresolved ncp.
1247  */
1248 static
1249 struct namecache *
1250 _cache_get(struct namecache *ncp)
1251 {
1252         _cache_hold(ncp);
1253         _cache_lock(ncp);
1254         if (ncp->nc_vp != NULL && (ncp->nc_vp->v_flag & VRECLAIMED) != 0)
1255                 _cache_setunresolved(ncp);
1256         return ncp;
1257 }
1258
1259 /*
1260  * Reference ncp and lock it shared when possible.  The shared lock is
1261  * kept only if the ncp is resolved and its vnode (if not ENOENT) is
1262  * valid.  Otherwise an exclusive lock is acquired via _cache_get().
1263  */
1264 static
1265 struct namecache *
1266 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1267 {
1268         if (ncp_shared_lock_disable || excl ||
1269             (ncp->nc_flag & NCF_UNRESOLVED)) {
1270                 return _cache_get(ncp);
1271         }
1272
1273         _cache_hold(ncp);
1274         _cache_lock_shared(ncp);
1275         if ((ncp->nc_flag & NCF_UNRESOLVED) ||
1276             (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))) {
1277                 /*
1278                  * Shared is not good enough.  Unlock and go through
1279                  * _cache_get(), then drop the extra ref it added.
1280                  */
1281                 _cache_unlock(ncp);
1282                 ncp = _cache_get(ncp);
1283                 _cache_drop(ncp);
1284         }
1285         return ncp;
1286 }
1287
1288 /*
1289  * Variant of _cache_lock() that only succeeds when it can obtain a
1290  * pristine, non-recursive lock without blocking.  The caller must
1291  * have already ref'd the ncp.
1292  *
1293  * Returns 0 with the ncp locked, or EWOULDBLOCK with it unlocked.  The
1294  * ref count does not change either way.
1295  *
1296  * On success the ncp has either a definitively usable vnode or is
1297  * definitively unresolved.
1298  */
1299 static int
1300 _cache_lock_special(struct namecache *ncp)
1301 {
1302         if (_cache_lock_nonblock(ncp) != 0)
1303                 return EWOULDBLOCK;
1304         if ((ncp->nc_lockstatus &
1305              ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) != 1) {
1306                 _cache_unlock(ncp);     /* lock count is not 1, back out */
1307                 return EWOULDBLOCK;
1308         }
1309         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1310                 _cache_setunresolved(ncp);
1311         return 0;
1312 }
1313
1314 /*
1315  * This function tries to get a shared lock but will back-off to an exclusive
1316  * lock if:
1317  *
1318  * (1) Some other thread is trying to obtain an exclusive lock
1319  *     (to prevent the exclusive requester from getting livelocked out
1320  *     by many shared locks).
1321  *
1322  * (2) The current thread already owns an exclusive lock (to avoid
1323  *     deadlocking).
1324  *
1325  * WARNING! On machines with lots of cores we really want to try hard to
1326  *          get a shared lock or concurrent path lookups can chain-react
1327  *          into a very high-latency exclusive lock.
1328  */
1329 static int
1330 _cache_lock_shared_special(struct namecache *ncp)
1331 {
1332         /*
1333          * Only honor a successful shared lock (returning 0) if there is
1334          * no exclusive request pending and the vnode, if present, is not
1335          * in a reclaimed state.  Otherwise unlock and return EWOULDBLOCK.
1336          */
1337         if (_cache_lock_shared_nonblock(ncp) == 0) {
1338                 if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) {
1339                         if (ncp->nc_vp == NULL ||
1340                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
1341                                 return(0);
1342                         }
1343                 }
1344                 _cache_unlock(ncp);     /* give the shared lock back */
1345                 return(EWOULDBLOCK);
1346         }
1347
1348         /*
1349          * Non-blocking shared lock failed.  If we already own the exclusive
1350          * lock just acquire another exclusive lock (instead of deadlocking).
1351          * Otherwise acquire a shared lock.
1352          */
1353         if (ncp->nc_locktd == curthread) {
1354                 _cache_lock(ncp);
1355                 return(0);
1356         }
1357         _cache_lock_shared(ncp);        /* blocking shared acquisition */
1358         return(0);
1359 }
1360
1361
1362 /*
1363  * Ref+lock nch into target via _cache_get(); nch and target may be the same.
1364  */
1365 void
1366 cache_get(struct nchandle *nch, struct nchandle *target)
1367 {
1368         KKASSERT(nch->ncp->nc_refs > 0);
1369         target->mount = nch->mount;
1370         target->ncp = _cache_get(nch->ncp);     /* refs and locks the ncp */
1371         _cache_mntref(target->mount);           /* target's own mount ref */
1372 }
1373 /* Like cache_get() but locks through _cache_get_maybe_shared(ncp, excl). */
1374 void
1375 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1376 {
1377         KKASSERT(nch->ncp->nc_refs > 0);
1378         target->mount = nch->mount;
1379         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1380         _cache_mntref(target->mount);           /* target's own mount ref */
1381 }
1382
1383 /*
1384  * Unlock ncp and then drop one reference on it (unlock-and-deref).
1385  */
1386 static __inline
1387 void
1388 _cache_put(struct namecache *ncp)
1389 {
1390         _cache_unlock(ncp);
1391         _cache_drop(ncp);
1392 }
1393
1394 /*
1395  * Release the mount ref, unlock and deref the ncp, then clear the handle.
1396  */
1397 void
1398 cache_put(struct nchandle *nch)
1399 {
1400         _cache_mntrel(nch->mount);
1401         _cache_put(nch->ncp);
1402         /* Neither field is ours any more; clear them both. */
1403         cache_zero(nch);
1404 }
1405
1406 /*
1407  * Resolve an unresolved ncp by associating a vnode with it.  If the
1408  * vnode is NULL, a negative cache entry is created.  mp may be NULL.
1409  *
1410  * The ncp must be exclusively locked on entry and remains locked on return.
1411  */
1412 static
1413 void
1414 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
1415 {
1416         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
1417         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1418
1419         if (vp != NULL) {
1420                 /*
1421                  * Any vp associated with an ncp which has children must
1422                  * be held.  Any vp associated with a locked ncp must be held.
1423                  */
1424                 if (!TAILQ_EMPTY(&ncp->nc_list))
1425                         vhold(vp);      /* hold on behalf of the children */
1426                 spin_lock(&vp->v_spin);
1427                 ncp->nc_vp = vp;
1428                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1429                 spin_unlock(&vp->v_spin);
1430                 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1431                         vhold(vp);      /* hold on behalf of the ncp lock */
1432
1433                 /*
1434                  * Set auxiliary flags
1435                  */
1436                 switch(vp->v_type) {
1437                 case VDIR:
1438                         ncp->nc_flag |= NCF_ISDIR;
1439                         break;
1440                 case VLNK:
1441                         ncp->nc_flag |= NCF_ISSYMLINK;
1442                         /* XXX cache the contents of the symlink */
1443                         break;
1444                 default:
1445                         break;
1446                 }
1447                 atomic_add_int(&numcache, 1);   /* counts entries that have a vp */
1448                 ncp->nc_error = 0;
1449                 /* XXX: this is a hack to work-around the lack of a real pfs vfs
1450                  * implementation*/
1451                 if (mp != NULL)
1452                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1453                                 vp->v_pfsmp = mp;
1454         } else {
1455                 /*
1456                  * When creating a negative cache hit we set the
1457                  * namecache_gen.  A later resolve will clean out the
1458                  * negative cache hit if the mount point's namecache_gen
1459                  * has changed.  Used by devfs, could also be used by
1460                  * other remote FSs.
1461                  */
1462                 ncp->nc_vp = NULL;
1463                 spin_lock(&ncspin);     /* ncspin covers ncneglist and numneg */
1464                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
1465                 ++numneg;
1466                 spin_unlock(&ncspin);
1467                 ncp->nc_error = ENOENT;
1468                 if (mp)
1469                         VFS_NCPGEN_SET(mp, ncp);
1470         }
1471         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);     /* now resolved */
1472 }
1473
1474 /*
1475  * Resolve nch's ncp to vp (NULL for a negative entry); see _cache_setvp().
1476  */
1477 void
1478 cache_setvp(struct nchandle *nch, struct vnode *vp)
1479 {
1480         _cache_setvp(nch->mount, nch->ncp, vp);
1481 }
1482
1483 /*
1484  * Set the ncp's timeout to ticks + nticks, storing 1 if that sum is 0.
1485  */
1486 void
1487 cache_settimeout(struct nchandle *nch, int nticks)
1488 {
1489         struct namecache *entry = nch->ncp;
1490         entry->nc_timeout = ticks + nticks;
1491         if (entry->nc_timeout == 0)
1492                 entry->nc_timeout = 1;
1493 }
1494
1495 /*
1496  * Disassociate the vnode or negative-cache association and mark a
1497  * namecache entry as unresolved again.  Note that the ncp is still
1498  * left in the hash table and still linked to its parent.
1499  *
1500  * The ncp should be locked and refd on entry and will remain locked and refd
1501  * on return.  Nothing is done if the ncp is already unresolved.
1502  *
1503  * This routine is normally never called on a directory containing children.
1504  * However, NFS often does just that in its rename() code as a cop-out to
1505  * avoid complex namespace operations.  This disconnects a directory vnode
1506  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1507  * sync.
1508  *
1509  */
1510 static
1511 void
1512 _cache_setunresolved(struct namecache *ncp)
1513 {
1514         struct vnode *vp;
1515
1516         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1517                 ncp->nc_flag |= NCF_UNRESOLVED;
1518                 ncp->nc_timeout = 0;
1519                 ncp->nc_error = ENOTCONN;
1520                 if ((vp = ncp->nc_vp) != NULL) {        /* entry with a vnode */
1521                         atomic_add_int(&numcache, -1);
1522                         spin_lock(&vp->v_spin);
1523                         ncp->nc_vp = NULL;
1524                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1525                         spin_unlock(&vp->v_spin);
1526
1527                         /*
1528                          * Any vp associated with an ncp with children is
1529                          * held by that ncp.  Any vp associated with a locked
1530                          * ncp is held by that ncp.  These conditions must be
1531                          * undone when the vp is cleared out from the ncp.
1532                          */
1533                         if (!TAILQ_EMPTY(&ncp->nc_list))
1534                                 vdrop(vp);
1535                         if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1536                                 vdrop(vp);
1537                 } else {                                /* negative entry */
1538                         spin_lock(&ncspin);
1539                         TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
1540                         --numneg;
1541                         spin_unlock(&ncspin);
1542                 }
1543                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1544         }
1545 }
1546
1547 /*
1548  * Decide whether cache_nresolve() should automatically revert a
1549  * resolved cache element to unresolved: returns 1 if it has timed out
1550  * or if it is a negative cache hit and the mount point namecache_gen
1551  * has changed, otherwise 0.
1552  */
1553 static __inline int
1554 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1555 {
1556         /*
1557          * Stale case 1: the timeout has expired.  We have to be careful
1558          * here because locked leafs may depend on the vnode remaining
1559          * intact in a parent, so this only applies to entries that have
1560          * no children.
1561          */
1562         if (ncp->nc_timeout != 0 &&
1563             (int)(ncp->nc_timeout - ticks) < 0 &&
1564             TAILQ_EMPTY(&ncp->nc_list)) {
1565                 return 1;
1566         }
1567
1568         /*
1569          * Stale case 2: a resolved negative cache hit whose mount has had
1570          * its namecache generation bumped.
1571          */
1572         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp))
1573                 return 1;
1574
1575         /*
1576          * Neither applies, the entry is still good.
1577          */
1578         return 0;
1579 }
1580
1581 static __inline void
1582 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1583 {
1584         /* Already in an unresolved state, nothing to do. */
1585         if (ncp->nc_flag & NCF_UNRESOLVED)
1586                 return;
1587
1588         /* Resolved: revert it when _cache_auto_unresolve_test() says so. */
1589         if (_cache_auto_unresolve_test(mp, ncp))
1590                 _cache_setunresolved(ncp);
1591 }
1592
1593 /*
1594  *
1595  */
1596 void
1597 cache_setunresolved(struct nchandle *nch)
1598 {
1599         _cache_setunresolved(nch->ncp);
1600 }
1601
1602 /*
1603  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1604  * looking for matches.  This flag tells the lookup code when it must
1605  * check for a mount linkage and also prevents the directories in question
1606  * from being deleted or renamed.
1607  */
1608 static
1609 int
1610 cache_clrmountpt_callback(struct mount *mp, void *data)
1611 {
1612         struct nchandle *nch = data;
1613
1614         if (mp->mnt_ncmounton.ncp == nch->ncp)
1615                 return(1);
1616         if (mp->mnt_ncmountpt.ncp == nch->ncp)
1617                 return(1);
1618         return(0);
1619 }
1620
1621 /*
1622  *
1623  */
1624 void
1625 cache_clrmountpt(struct nchandle *nch)
1626 {
1627         int count;
1628
1629         count = mountlist_scan(cache_clrmountpt_callback, nch,
1630                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
1631         if (count == 0)
1632                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1633 }
1634
1635 /*
1636  * Invalidate portions of the namecache topology given a starting entry.
1637  * The passed ncp is set to an unresolved state and:
1638  *
 * The passed ncp must be referenced and locked.  The routine may unlock
1640  * and relock ncp several times, and will recheck the children and loop
1641  * to catch races.  When done the passed ncp will be returned with the
1642  * reference and lock intact.
1643  *
1644  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
1645  *                        that the physical underlying nodes have been
1646  *                        destroyed... as in deleted.  For example, when
1647  *                        a directory is removed.  This will cause record
1648  *                        lookups on the name to no longer be able to find
1649  *                        the record and tells the resolver to return failure
 *                        rather than trying to resolve through the parent.
1651  *
1652  *                        The topology itself, including ncp->nc_name,
1653  *                        remains intact.
1654  *
1655  *                        This only applies to the passed ncp, if CINV_CHILDREN
1656  *                        is specified the children are not flagged.
1657  *
1658  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
1659  *                        state as well.
1660  *
1661  *                        Note that this will also have the side effect of
1662  *                        cleaning out any unreferenced nodes in the topology
1663  *                        from the leaves up as the recursion backs out.
1664  *
1665  * Note that the topology for any referenced nodes remains intact, but
1666  * the nodes will be marked as having been destroyed and will be set
1667  * to an unresolved state.
1668  *
1669  * It is possible for cache_inval() to race a cache_resolve(), meaning that
1670  * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
1674  *
1675  * Returns non-zero if a revalidation was detected during the invalidation
1676  * recursion, zero otherwise.  Note that since only the original ncp is
1677  * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* now have been reresolved.
1679  *
1680  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1681  * have to avoid blowing out the kernel stack.  We do this by saving the
1682  * deep namecache node and aborting the recursion, then re-recursing at that
1683  * node using a depth-first algorithm in order to allow multiple deep
1684  * recursions to chain through each other, then we restart the invalidation
1685  * from scratch.
1686  */
1687
/*
 * Tracking state shared between _cache_inval() and the recursive
 * _cache_inval_internal() helper, used to bound the recursion depth.
 */
struct cinvtrack {
        /*
         * Node at which a too-deep recursion was aborted, or NULL.
         * _cache_inval_internal() stores the node here with an extra
         * hold; _cache_inval() later locks it, re-runs the invalidation
         * on it and releases it with _cache_put().
         */
        struct namecache *resume_ncp;
        /*
         * Current nesting level.  Incremented by _cache_inval_internal()
         * when it starts scanning a node's children and decremented
         * when that scan ends; compared against MAX_RECURSION_DEPTH.
         */
        int depth;
};
1692
1693 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1694
1695 static
1696 int
1697 _cache_inval(struct namecache *ncp, int flags)
1698 {
1699         struct cinvtrack track;
1700         struct namecache *ncp2;
1701         int r;
1702
1703         track.depth = 0;
1704         track.resume_ncp = NULL;
1705
1706         for (;;) {
1707                 r = _cache_inval_internal(ncp, flags, &track);
1708                 if (track.resume_ncp == NULL)
1709                         break;
1710                 _cache_unlock(ncp);
1711                 while ((ncp2 = track.resume_ncp) != NULL) {
1712                         track.resume_ncp = NULL;
1713                         _cache_lock(ncp2);
1714                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1715                                              &track);
1716                         _cache_put(ncp2);
1717                 }
1718                 _cache_lock(ncp);
1719         }
1720         return(r);
1721 }
1722
1723 int
1724 cache_inval(struct nchandle *nch, int flags)
1725 {
1726         return(_cache_inval(nch->ncp, flags));
1727 }
1728
/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 *
 * ncp is set unresolved and, if CINV_DESTROY is given, flagged
 * NCF_DESTROYED with its generation bumped.  If CINV_CHILDREN is given
 * the children are visited recursively (never with CINV_DESTROY).
 *
 * track carries the recursion depth.  When the depth limit is exceeded
 * the current node is saved (with an extra hold) in track->resume_ncp
 * and the recursion unwinds without descending further.
 *
 * The return value is a count: it is bumped when the depth limit is
 * hit, accumulates the children's return values, and is bumped once
 * more if ncp is found resolved again on the way out.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
        struct namecache *nextkid;
        int rcnt = 0;

        KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

        _cache_setunresolved(ncp);
        if (flags & CINV_DESTROY) {
                ncp->nc_flag |= NCF_DESTROYED;
                ++ncp->nc_generation;
        }
        /*
         * Each pass of this outer loop is one complete scan of the
         * child list.  It only repeats when a scan had to be abandoned
         * because a child was reparented underneath us (restart != 0).
         */
        while ((flags & CINV_CHILDREN) &&
               (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
        ) {
                struct namecache *kid;
                int restart;

                restart = 0;
                _cache_hold(nextkid);
                /*
                 * Too deep.  Remember this node (held) so that
                 * _cache_inval() can resume from it; the inner loop
                 * below notices resume_ncp and bails out without
                 * recursing.
                 */
                if (++track->depth > MAX_RECURSION_DEPTH) {
                        track->resume_ncp = ncp;
                        _cache_hold(ncp);
                        ++rcnt;
                }
                while ((kid = nextkid) != NULL) {
                        /*
                         * Parent (ncp) must be locked for the iteration.
                         */
                        nextkid = NULL;
                        if (kid->nc_parent != ncp) {
                                _cache_drop(kid);
                                kprintf("cache_inval_internal restartA %s\n",
                                        ncp->nc_name);
                                restart = 1;
                                break;
                        }
                        /*
                         * Hold the next sibling before the parent lock
                         * is released.
                         */
                        if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
                                _cache_hold(nextkid);

                        /*
                         * Parent unlocked for this section to avoid
                         * deadlocks.
                         */
                        _cache_unlock(ncp);
                        /*
                         * A deep-recursion abort is pending (set here or
                         * further down the tree), stop scanning.
                         */
                        if (track->resume_ncp) {
                                _cache_drop(kid);
                                _cache_lock(ncp);
                                break;
                        }
                        /*
                         * Only kids which are resolved or which have
                         * children of their own need to be visited.
                         */
                        if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
                            TAILQ_FIRST(&kid->nc_list)
                        ) {
                                _cache_lock(kid);
                                /*
                                 * Recheck the parent linkage now that
                                 * the kid is locked.
                                 */
                                if (kid->nc_parent != ncp) {
                                        kprintf("cache_inval_internal "
                                                "restartB %s\n",
                                                ncp->nc_name);
                                        restart = 1;
                                        _cache_unlock(kid);
                                        _cache_drop(kid);
                                        _cache_lock(ncp);
                                        break;
                                }

                                rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
                                _cache_unlock(kid);
                        }
                        _cache_drop(kid);
                        _cache_lock(ncp);
                }
                /*
                 * An early exit can leave a held next sibling behind.
                 */
                if (nextkid)
                        _cache_drop(nextkid);
                --track->depth;
                if (restart == 0)
                        break;
        }

        /*
         * Someone could have gotten in there while ncp was unlocked,
         * report it to the caller via the return count if so.
         */
        if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
                ++rcnt;
        return (rcnt);
}
1821
/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * flags is passed through unchanged to _cache_inval() for every entry.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *       be ripped out of the topology while held, the vnode's v_namecache
 *       list has no such restriction.  NCP's can be ripped out of the list
 *       at virtually any time if not locked, even if held.
 *
 *       In addition, the v_namecache list itself must be locked via
 *       the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
        struct namecache *ncp;
        struct namecache *next;

restart:
        spin_lock(&vp->v_spin);
        ncp = TAILQ_FIRST(&vp->v_namecache);
        if (ncp)
                _cache_hold(ncp);
        while (ncp) {
                /* loop entered with ncp held and vp spin-locked */
                if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
                        _cache_hold(next);
                /*
                 * The spinlock is released before the ncp lock is
                 * acquired.
                 */
                spin_unlock(&vp->v_spin);
                _cache_lock(ncp);
                /*
                 * The ncp may have been disassociated from vp while we
                 * were acquiring its lock.  Release everything and
                 * rescan the list from the top.
                 */
                if (ncp->nc_vp != vp) {
                        kprintf("Warning: cache_inval_vp: race-A detected on "
                                "%s\n", ncp->nc_name);
                        _cache_put(ncp);
                        if (next)
                                _cache_drop(next);
                        goto restart;
                }
                _cache_inval(ncp, flags);
                _cache_put(ncp);                /* also releases reference */
                ncp = next;
                spin_lock(&vp->v_spin);
                /*
                 * The held (but unlocked) next entry may no longer
                 * belong to vp, in which case it cannot be used to
                 * continue the walk.
                 */
                if (ncp && ncp->nc_vp != vp) {
                        spin_unlock(&vp->v_spin);
                        kprintf("Warning: cache_inval_vp: race-B detected on "
                                "%s\n", ncp->nc_name);
                        _cache_drop(ncp);
                        goto restart;
                }
        }
        spin_unlock(&vp->v_spin);
        /* sampled after the spinlock has been released */
        return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
1878
1879 /*
1880  * This routine is used instead of the normal cache_inval_vp() when we
1881  * are trying to recycle otherwise good vnodes.
1882  *
1883  * Return 0 on success, non-zero if not all namecache records could be
1884  * disassociated from the vnode (for various reasons).
1885  */
1886 int
1887 cache_inval_vp_nonblock(struct vnode *vp)
1888 {
1889         struct namecache *ncp;
1890         struct namecache *next;
1891
1892         spin_lock(&vp->v_spin);
1893         ncp = TAILQ_FIRST(&vp->v_namecache);
1894         if (ncp)
1895                 _cache_hold(ncp);
1896         while (ncp) {
1897                 /* loop entered with ncp held */
1898                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1899                         _cache_hold(next);
1900                 spin_unlock(&vp->v_spin);
1901                 if (_cache_lock_nonblock(ncp)) {
1902                         _cache_drop(ncp);
1903                         if (next)
1904                                 _cache_drop(next);
1905                         goto done;
1906                 }
1907                 if (ncp->nc_vp != vp) {
1908                         kprintf("Warning: cache_inval_vp: race-A detected on "
1909                                 "%s\n", ncp->nc_name);
1910                         _cache_put(ncp);
1911                         if (next)
1912                                 _cache_drop(next);
1913                         goto done;
1914                 }
1915                 _cache_inval(ncp, 0);
1916                 _cache_put(ncp);                /* also releases reference */
1917                 ncp = next;
1918                 spin_lock(&vp->v_spin);
1919                 if (ncp && ncp->nc_vp != vp) {
1920                         spin_unlock(&vp->v_spin);
1921                         kprintf("Warning: cache_inval_vp: race-B detected on "
1922                                 "%s\n", ncp->nc_name);
1923                         _cache_drop(ncp);
1924                         goto done;
1925                 }
1926         }
1927         spin_unlock(&vp->v_spin);
1928 done:
1929         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1930 }
1931
1932 /*
1933  * Clears the universal directory search 'ok' flag.  This flag allows
1934  * nlookup() to bypass normal vnode checks.  This flag is a cached flag
1935  * so clearing it simply forces revalidation.
1936  */
1937 void
1938 cache_inval_wxok(struct vnode *vp)
1939 {
1940         struct namecache *ncp;
1941
1942         spin_lock(&vp->v_spin);
1943         TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1944                 if (ncp->nc_flag & NCF_WXOK)
1945                         atomic_clear_short(&ncp->nc_flag, NCF_WXOK);
1946         }
1947         spin_unlock(&vp->v_spin);
1948 }
1949
1950 /*
1951  * The source ncp has been renamed to the target ncp.  Both fncp and tncp
1952  * must be locked.  The target ncp is destroyed (as a normal rename-over
1953  * would destroy the target file or directory).
1954  *
1955  * Because there may be references to the source ncp we cannot copy its
1956  * contents to the target.  Instead the source ncp is relinked as the target
1957  * and the target ncp is removed from the namecache topology.
1958  */
1959 void
1960 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
1961 {
1962         struct namecache *fncp = fnch->ncp;
1963         struct namecache *tncp = tnch->ncp;
1964         struct namecache *tncp_par;
1965         struct nchash_head *nchpp;
1966         u_int32_t hash;
1967         char *oname;
1968         char *nname;
1969
1970         ++fncp->nc_generation;
1971         ++tncp->nc_generation;
1972         if (tncp->nc_nlen) {
1973                 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
1974                 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
1975                 nname[tncp->nc_nlen] = 0;
1976         } else {
1977                 nname = NULL;
1978         }
1979
1980         /*
1981          * Rename fncp (unlink)
1982          */
1983         _cache_unlink_parent(fncp);
1984         oname = fncp->nc_name;
1985         fncp->nc_name = nname;
1986         fncp->nc_nlen = tncp->nc_nlen;
1987         if (oname)
1988                 kfree(oname, M_VFSCACHE);
1989
1990         tncp_par = tncp->nc_parent;
1991         _cache_hold(tncp_par);
1992         _cache_lock(tncp_par);
1993
1994         /*
1995          * Rename fncp (relink)
1996          */
1997         hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
1998         hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
1999         nchpp = NCHHASH(hash);
2000
2001         spin_lock(&nchpp->spin);
2002         _cache_link_parent(fncp, tncp_par, nchpp);
2003         spin_unlock(&nchpp->spin);
2004
2005         _cache_put(tncp_par);
2006
2007         /*
2008          * Get rid of the overwritten tncp (unlink)
2009          */
2010         _cache_unlink(tncp);
2011 }
2012
2013 /*
2014  * Perform actions consistent with unlinking a file.  The passed-in ncp
2015  * must be locked.
2016  *
2017  * The ncp is marked DESTROYED so it no longer shows up in searches,
2018  * and will be physically deleted when the vnode goes away.
2019  *
2020  * If the related vnode has no refs then we cycle it through vget()/vput()
2021  * to (possibly if we don't have a ref race) trigger a deactivation,
2022  * allowing the VFS to trivially detect and recycle the deleted vnode
2023  * via VOP_INACTIVE().
2024  *
2025  * NOTE: _cache_rename() will automatically call _cache_unlink() on the
2026  *       target ncp.
2027  */
2028 void
2029 cache_unlink(struct nchandle *nch)
2030 {
2031         _cache_unlink(nch->ncp);
2032 }
2033
2034 static void
2035 _cache_unlink(struct namecache *ncp)
2036 {
2037         struct vnode *vp;
2038
2039         /*
2040          * Causes lookups to fail and allows another ncp with the same
2041          * name to be created under ncp->nc_parent.
2042          */
2043         ncp->nc_flag |= NCF_DESTROYED;
2044         ++ncp->nc_generation;
2045
2046         /*
2047          * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
2048          * force action on the 1->0 transition.
2049          */
2050         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
2051             (vp = ncp->nc_vp) != NULL) {
2052                 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
2053                 if (VREFCNT(vp) <= 0) {
2054                         if (vget(vp, LK_SHARED) == 0)
2055                                 vput(vp);
2056                 }
2057         }
2058 }
2059
2060 /*
2061  * Return non-zero if the nch might be associated with an open and/or mmap()'d
2062  * file.  The easy solution is to just return non-zero if the vnode has refs.
2063  * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
2064  * force the reclaim).
2065  */
2066 int
2067 cache_isopen(struct nchandle *nch)
2068 {
2069         struct vnode *vp;
2070         struct namecache *ncp = nch->ncp;
2071
2072         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
2073             (vp = ncp->nc_vp) != NULL &&
2074             VREFCNT(vp)) {
2075                 return 1;
2076         }
2077         return 0;
2078 }
2079
2080
2081 /*
2082  * vget the vnode associated with the namecache entry.  Resolve the namecache
2083  * entry if necessary.  The passed ncp must be referenced and locked.  If
2084  * the ncp is resolved it might be locked shared.
2085  *
2086  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
2087  * (depending on the passed lk_type) will be returned in *vpp with an error
2088  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
2089  * most typical error is ENOENT, meaning that the ncp represents a negative
2090  * cache hit and there is no vnode to retrieve, but other errors can occur
2091  * too.
2092  *
2093  * The vget() can race a reclaim.  If this occurs we re-resolve the
2094  * namecache entry.
2095  *
2096  * There are numerous places in the kernel where vget() is called on a
2097  * vnode while one or more of its namecache entries is locked.  Releasing
2098  * a vnode never deadlocks against locked namecache entries (the vnode
2099  * will not get recycled while referenced ncp's exist).  This means we
2100  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
2101  * lock when acquiring the vp lock or we might cause a deadlock.
2102  *
2103  * NOTE: The passed-in ncp must be locked exclusively if it is initially
2104  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
2105  *       relocked exclusively before being re-resolved.
2106  */
2107 int
2108 cache_vget(struct nchandle *nch, struct ucred *cred,
2109            int lk_type, struct vnode **vpp)
2110 {
2111         struct namecache *ncp;
2112         struct vnode *vp;
2113         int error;
2114
2115         ncp = nch->ncp;
2116 again:
2117         vp = NULL;
2118         if (ncp->nc_flag & NCF_UNRESOLVED)
2119                 error = cache_resolve(nch, cred);
2120         else
2121                 error = 0;
2122
2123         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
2124                 error = vget(vp, lk_type);
2125                 if (error) {
2126                         /*
2127                          * VRECLAIM race
2128                          *
2129                          * The ncp may have been locked shared, we must relock
2130                          * it exclusively before we can set it to unresolved.
2131                          */
2132                         if (error == ENOENT) {
2133                                 kprintf("Warning: vnode reclaim race detected "
2134                                         "in cache_vget on %p (%s)\n",
2135                                         vp, ncp->nc_name);
2136                                 _cache_unlock(ncp);
2137                                 _cache_lock(ncp);
2138                                 _cache_setunresolved(ncp);
2139                                 goto again;
2140                         }
2141
2142                         /*
2143                          * Not a reclaim race, some other error.
2144                          */
2145                         KKASSERT(ncp->nc_vp == vp);
2146                         vp = NULL;
2147                 } else {
2148                         KKASSERT(ncp->nc_vp == vp);
2149                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2150                 }
2151         }
2152         if (error == 0 && vp == NULL)
2153                 error = ENOENT;
2154         *vpp = vp;
2155         return(error);
2156 }
2157
2158 /*
2159  * Similar to cache_vget() but only acquires a ref on the vnode.
2160  *
2161  * NOTE: The passed-in ncp must be locked exclusively if it is initially
2162  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
2163  *       relocked exclusively before being re-resolved.
2164  */
2165 int
2166 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
2167 {
2168         struct namecache *ncp;
2169         struct vnode *vp;
2170         int error;
2171
2172         ncp = nch->ncp;
2173 again:
2174         vp = NULL;
2175         if (ncp->nc_flag & NCF_UNRESOLVED)
2176                 error = cache_resolve(nch, cred);
2177         else
2178                 error = 0;
2179
2180         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
2181                 error = vget(vp, LK_SHARED);
2182                 if (error) {
2183                         /*
2184                          * VRECLAIM race
2185                          */
2186                         if (error == ENOENT) {
2187                                 kprintf("Warning: vnode reclaim race detected "
2188                                         "in cache_vget on %p (%s)\n",
2189                                         vp, ncp->nc_name);
2190                                 _cache_unlock(ncp);
2191                                 _cache_lock(ncp);
2192                                 _cache_setunresolved(ncp);
2193                                 goto again;
2194                         }
2195
2196                         /*
2197                          * Not a reclaim race, some other error.
2198                          */
2199                         KKASSERT(ncp->nc_vp == vp);
2200                         vp = NULL;
2201                 } else {
2202                         KKASSERT(ncp->nc_vp == vp);
2203                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2204                         /* caller does not want a lock */
2205                         vn_unlock(vp);
2206                 }
2207         }
2208         if (error == 0 && vp == NULL)
2209                 error = ENOENT;
2210         *vpp = vp;
2211         return(error);
2212 }
2213
2214 /*
2215  * Return a referenced vnode representing the parent directory of
2216  * ncp.
2217  *
2218  * Because the caller has locked the ncp it should not be possible for
2219  * the parent ncp to go away.  However, the parent can unresolve its
2220  * dvp at any time so we must be able to acquire a lock on the parent
2221  * to safely access nc_vp.
2222  *
2223  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
2224  * so use vhold()/vdrop() while holding the lock to prevent dvp from
2225  * getting destroyed.
2226  *
2227  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
2228  *       lock on the ncp in question..
2229  */
2230 static struct vnode *
2231 cache_dvpref(struct namecache *ncp)
2232 {
2233         struct namecache *par;
2234         struct vnode *dvp;
2235
2236         dvp = NULL;
2237         if ((par = ncp->nc_parent) != NULL) {
2238                 _cache_hold(par);
2239                 _cache_lock(par);
2240                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
2241                         if ((dvp = par->nc_vp) != NULL)
2242                                 vhold(dvp);
2243                 }
2244                 _cache_unlock(par);
2245                 if (dvp) {
2246                         if (vget(dvp, LK_SHARED) == 0) {
2247                                 vn_unlock(dvp);
2248                                 vdrop(dvp);
2249                                 /* return refd, unlocked dvp */
2250                         } else {
2251                                 vdrop(dvp);
2252                                 dvp = NULL;
2253                         }
2254                 }
2255                 _cache_drop(par);
2256         }
2257         return(dvp);
2258 }
2259
2260 /*
2261  * Convert a directory vnode to a namecache record without any other
2262  * knowledge of the topology.  This ONLY works with directory vnodes and
2263  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
2264  * returned ncp (if not NULL) will be held and unlocked.
2265  *
2266  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2267  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2268  * for dvp.  This will fail only if the directory has been deleted out from
2269  * under the caller.
2270  *
2271  * Callers must always check for a NULL return no matter the value of 'makeit'.
2272  *
2273  * To avoid underflowing the kernel stack each recursive call increments
2274  * the makeit variable.
2275  */
2276
2277 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2278                                   struct vnode *dvp, char *fakename);
2279 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2280                                   struct vnode **saved_dvp);
2281
/*
 * Locate (makeit == 0) or construct (makeit != 0) a namecache record for
 * the directory vnode dvp and return it in *nch.
 *
 * nch->mount is set to dvp->v_mount.  On return nch->ncp is either NULL
 * or a held, unlocked ncp taken from dvp's v_namecache list.
 *
 * Returns 0 if nch->ncp is non-NULL, EINVAL otherwise.
 */
int
cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
              struct nchandle *nch)
{
        struct vnode *saved_dvp;
        struct vnode *pvp;
        char *fakename;
        int error;

        nch->ncp = NULL;
        nch->mount = dvp->v_mount;
        saved_dvp = NULL;
        fakename = NULL;

        /*
         * Handle the makeit == 0 degenerate case
         */
        if (makeit == 0) {
                spin_lock_shared(&dvp->v_spin);
                nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
                if (nch->ncp)
                        cache_hold(nch);
                spin_unlock_shared(&dvp->v_spin);
        }

        /*
         * Loop until resolution, inside code will break out on error.
         */
        while (makeit) {
                /*
                 * Break out if we successfully acquire a working ncp.
                 */
                spin_lock_shared(&dvp->v_spin);
                nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
                if (nch->ncp) {
                        cache_hold(nch);
                        spin_unlock_shared(&dvp->v_spin);
                        break;
                }
                spin_unlock_shared(&dvp->v_spin);

                /*
                 * If dvp is the root of its filesystem it should already
                 * have a namecache pointer associated with it as a side
                 * effect of the mount, but it may have been disassociated.
                 *
                 * nch->ncp only serves as a temporary here, it is
                 * reloaded at the top of the loop on success and
                 * cleared on failure.
                 */
                if (dvp->v_flag & VROOT) {
                        nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
                        error = cache_resolve_mp(nch->mount);
                        _cache_put(nch->ncp);
                        if (ncvp_debug) {
                                kprintf("cache_fromdvp: resolve root of mount %p error %d",
                                        dvp->v_mount, error);
                        }
                        if (error) {
                                if (ncvp_debug)
                                        kprintf(" failed\n");
                                nch->ncp = NULL;
                                break;
                        }
                        if (ncvp_debug)
                                kprintf(" succeeded\n");
                        continue;
                }

                /*
                 * If we are recursed too deeply resort to an O(n^2)
                 * algorithm to resolve the namecache topology.  The
                 * resolved pvp is left referenced in saved_dvp to
                 * prevent the tree from being destroyed while we loop.
                 */
                if (makeit > 20) {
                        error = cache_fromdvp_try(dvp, cred, &saved_dvp);
                        if (error) {
                                kprintf("lookupdotdot(longpath) failed %d "
                                       "dvp %p\n", error, dvp);
                                nch->ncp = NULL;
                                break;
                        }
                        continue;
                }

                /*
                 * Get the parent directory and resolve its ncp.
                 *
                 * Any fakename left over from a previous iteration is
                 * released first.
                 */
                if (fakename) {
                        kfree(fakename, M_TEMP);
                        fakename = NULL;
                }
                error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
                                          &fakename);
                if (error) {
                        kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
                        break;
                }
                vn_unlock(pvp);

                /*
                 * Reuse makeit as a recursion depth counter.  On success
                 * nch will be fully referenced.
                 */
                cache_fromdvp(pvp, cred, makeit + 1, nch);
                vrele(pvp);
                if (nch->ncp == NULL)
                        break;

                /*
                 * Do an inefficient scan of pvp (embodied by ncp) to look
                 * for dvp.  This will create a namecache record for dvp on
                 * success.  We loop up to recheck on success.
                 *
                 * ncp and dvp are both held but not locked.
                 */
                error = cache_inefficient_scan(nch, cred, dvp, fakename);
                if (error) {
                        kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
                                pvp, nch->ncp->nc_name, dvp);
                        cache_drop(nch);
                        /* nch was NULLed out, reload mount */
                        nch->mount = dvp->v_mount;
                        break;
                }
                if (ncvp_debug) {
                        kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
                                pvp, nch->ncp->nc_name);
                }
                cache_drop(nch);
                /* nch was NULLed out, reload mount */
                nch->mount = dvp->v_mount;
        }

        /*
         * If nch->ncp is non-NULL it will have been held already.
         *
         * Release the temporary name buffer and the vnode reference
         * saved by the deep-recursion path, if any.
         */
        if (fakename)
                kfree(fakename, M_TEMP);
        if (saved_dvp)
                vrele(saved_dvp);
        if (nch->ncp)
                return (0);
        return (EINVAL);
}
2424
2425 /*
2426  * Go up the chain of parent directories until we find something
2427  * we can resolve into the namecache.  This is very inefficient.
2428  */
2429 static
2430 int
2431 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2432                   struct vnode **saved_dvp)
2433 {
2434         struct nchandle nch;
2435         struct vnode *pvp;
2436         int error;
2437         static time_t last_fromdvp_report;
2438         char *fakename;
2439
2440         /*
2441          * Loop getting the parent directory vnode until we get something we
2442          * can resolve in the namecache.
2443          */
2444         vref(dvp);
2445         nch.mount = dvp->v_mount;
2446         nch.ncp = NULL;
2447         fakename = NULL;
2448
2449         for (;;) {
2450                 if (fakename) {
2451                         kfree(fakename, M_TEMP);
2452                         fakename = NULL;
2453                 }
2454                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2455                                           &fakename);
2456                 if (error) {
2457                         vrele(dvp);
2458                         break;
2459                 }
2460                 vn_unlock(pvp);
2461                 spin_lock_shared(&pvp->v_spin);
2462                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2463                         _cache_hold(nch.ncp);
2464                         spin_unlock_shared(&pvp->v_spin);
2465                         vrele(pvp);
2466                         break;
2467                 }
2468                 spin_unlock_shared(&pvp->v_spin);
2469                 if (pvp->v_flag & VROOT) {
2470                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2471                         error = cache_resolve_mp(nch.mount);
2472                         _cache_unlock(nch.ncp);
2473                         vrele(pvp);
2474                         if (error) {
2475                                 _cache_drop(nch.ncp);
2476                                 nch.ncp = NULL;
2477                                 vrele(dvp);
2478                         }
2479                         break;
2480                 }
2481                 vrele(dvp);
2482                 dvp = pvp;
2483         }
2484         if (error == 0) {
2485                 if (last_fromdvp_report != time_uptime) {
2486                         last_fromdvp_report = time_uptime;
2487                         kprintf("Warning: extremely inefficient path "
2488                                 "resolution on %s\n",
2489                                 nch.ncp->nc_name);
2490                 }
2491                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2492
2493                 /*
2494                  * Hopefully dvp now has a namecache record associated with
2495                  * it.  Leave it referenced to prevent the kernel from
2496                  * recycling the vnode.  Otherwise extremely long directory
2497                  * paths could result in endless recycling.
2498                  */
2499                 if (*saved_dvp)
2500                     vrele(*saved_dvp);
2501                 *saved_dvp = dvp;
2502                 _cache_drop(nch.ncp);
2503         }
2504         if (fakename)
2505                 kfree(fakename, M_TEMP);
2506         return (error);
2507 }
2508
2509 /*
2510  * Do an inefficient scan of the directory represented by ncp looking for
2511  * the directory vnode dvp.  ncp must be held but not locked on entry and
2512  * will be held on return.  dvp must be refd but not locked on entry and
2513  * will remain refd on return.
2514  *
2515  * Why do this at all?  Well, due to its stateless nature the NFS server
2516  * converts file handles directly to vnodes without necessarily going through
2517  * the namecache ops that would otherwise create the namecache topology
2518  * leading to the vnode.  We could either (1) Change the namecache algorithms
2519  * to allow disconnect namecache records that are re-merged opportunistically,
2520  * or (2) Make the NFS server backtrack and scan to recover a connected
2521  * namecache topology in order to then be able to issue new API lookups.
2522  *
2523  * It turns out that (1) is a huge mess.  It takes a nice clean set of
2524  * namecache algorithms and introduces a lot of complication in every subsystem
2525  * that calls into the namecache to deal with the re-merge case, especially
2526  * since we are using the namecache to placehold negative lookups and the
2527  * vnode might not be immediately assigned. (2) is certainly far less
2528  * efficient then (1), but since we are only talking about directories here
2529  * (which are likely to remain cached), the case does not actually run all
2530  * that often and has the supreme advantage of not polluting the namecache
2531  * algorithms.
2532  *
2533  * If a fakename is supplied just construct a namecache entry using the
2534  * fake name.
2535  */
2536 static int
2537 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2538                        struct vnode *dvp, char *fakename)
2539 {
2540         struct nlcomponent nlc;
2541         struct nchandle rncp;
2542         struct dirent *den;
2543         struct vnode *pvp;
2544         struct vattr vat;
2545         struct iovec iov;
2546         struct uio uio;
2547         int blksize;
2548         int eofflag;
2549         int bytes;
2550         char *rbuf;
2551         int error;
2552
2553         vat.va_blocksize = 0;
2554         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2555                 return (error);
2556         cache_lock(nch);
2557         error = cache_vref(nch, cred, &pvp);
2558         cache_unlock(nch);
2559         if (error)
2560                 return (error);
2561         if (ncvp_debug) {
2562                 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2563                         "vattr fileid = %lld\n",
2564                         nch->ncp, nch->ncp->nc_name,
2565                         vat.va_blocksize,
2566                         (long long)vat.va_fileid);
2567         }
2568
2569         /*
2570          * Use the supplied fakename if not NULL.  Fake names are typically
2571          * not in the actual filesystem hierarchy.  This is used by HAMMER
2572          * to glue @@timestamp recursions together.
2573          */
2574         if (fakename) {
2575                 nlc.nlc_nameptr = fakename;
2576                 nlc.nlc_namelen = strlen(fakename);
2577                 rncp = cache_nlookup(nch, &nlc);
2578                 goto done;
2579         }
2580
2581         if ((blksize = vat.va_blocksize) == 0)
2582                 blksize = DEV_BSIZE;
2583         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2584         rncp.ncp = NULL;
2585
2586         eofflag = 0;
2587         uio.uio_offset = 0;
2588 again:
2589         iov.iov_base = rbuf;
2590         iov.iov_len = blksize;
2591         uio.uio_iov = &iov;
2592         uio.uio_iovcnt = 1;
2593         uio.uio_resid = blksize;
2594         uio.uio_segflg = UIO_SYSSPACE;
2595         uio.uio_rw = UIO_READ;
2596         uio.uio_td = curthread;
2597
2598         if (ncvp_debug >= 2)
2599                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2600         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2601         if (error == 0) {
2602                 den = (struct dirent *)rbuf;
2603                 bytes = blksize - uio.uio_resid;
2604
2605                 while (bytes > 0) {
2606                         if (ncvp_debug >= 2) {
2607                                 kprintf("cache_inefficient_scan: %*.*s\n",
2608                                         den->d_namlen, den->d_namlen,
2609                                         den->d_name);
2610                         }
2611                         if (den->d_type != DT_WHT &&
2612                             den->d_ino == vat.va_fileid) {
2613                                 if (ncvp_debug) {
2614                                         kprintf("cache_inefficient_scan: "
2615                                                "MATCHED inode %lld path %s/%*.*s\n",
2616                                                (long long)vat.va_fileid,
2617                                                nch->ncp->nc_name,
2618                                                den->d_namlen, den->d_namlen,
2619                                                den->d_name);
2620                                 }
2621                                 nlc.nlc_nameptr = den->d_name;
2622                                 nlc.nlc_namelen = den->d_namlen;
2623                                 rncp = cache_nlookup(nch, &nlc);
2624                                 KKASSERT(rncp.ncp != NULL);
2625                                 break;
2626                         }
2627                         bytes -= _DIRENT_DIRSIZ(den);
2628                         den = _DIRENT_NEXT(den);
2629                 }
2630                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2631                         goto again;
2632         }
2633         kfree(rbuf, M_TEMP);
2634 done:
2635         vrele(pvp);
2636         if (rncp.ncp) {
2637                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2638                         _cache_setvp(rncp.mount, rncp.ncp, dvp);
2639                         if (ncvp_debug >= 2) {
2640                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2641                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2642                         }
2643                 } else {
2644                         if (ncvp_debug >= 2) {
2645                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2646                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2647                                         rncp.ncp->nc_vp);
2648                         }
2649                 }
2650                 if (rncp.ncp->nc_vp == NULL)
2651                         error = rncp.ncp->nc_error;
2652                 /*
2653                  * Release rncp after a successful nlookup.  rncp was fully
2654                  * referenced.
2655                  */
2656                 cache_put(&rncp);
2657         } else {
2658                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2659                         dvp, nch->ncp->nc_name);
2660                 error = ENOENT;
2661         }
2662         return (error);
2663 }
2664
2665 /*
2666  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2667  * state, which disassociates it from its vnode or ncneglist.
2668  *
2669  * Then, if there are no additional references to the ncp and no children,
2670  * the ncp is removed from the topology and destroyed.
2671  *
2672  * References and/or children may exist if the ncp is in the middle of the
2673  * topology, preventing the ncp from being destroyed.
2674  *
2675  * This function must be called with the ncp held and locked and will unlock
2676  * and drop it during zapping.
2677  *
2678  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2679  * This case can occur in the cache_drop() path.  When giving up the ncp
      * is flagged NCF_DEFEREDZAP, our ref on it is released, it is unlocked
      * and NULL is returned.
2680  *
2681  * This function may return a held (but NOT locked) parent node which the
2682  * caller must drop.  We do this so _cache_drop() can loop, to avoid
2683  * blowing out the kernel stack.
      *
      * Returns NULL if the ncp was not destroyed (zap deferred, other refs
      * exist, or children exist).  If the ncp was destroyed its former
      * parent is returned, which is NULL when the ncp had no parent.
2684  *
2685  * WARNING!  For MPSAFE operation this routine must acquire up to three
2686  *           spin locks to be able to safely test nc_refs.  Lock order is
2687  *           very important.
2688  *
2689  *           hash spinlock if on hash list
2690  *           parent spinlock if child of parent
2691  *           (the ncp is unresolved so there is no vnode association)
2692  */
2693 static struct namecache *
2694 cache_zap(struct namecache *ncp, int nonblock)
2695 {
2696         struct namecache *par;
2697         struct vnode *dropvp;
2698         struct nchash_head *nchpp;
2699         int refs;
2700
2701         /*
2702          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2703          */
2704         _cache_setunresolved(ncp);
2705
2706         /*
2707          * Try to scrap the entry and possibly tail-recurse on its parent.
2708          * We only scrap unref'd (other than our ref) unresolved entries,
2709          * we do not scrap 'live' entries.
2710          *
2711          * Note that once the spinlocks are acquired if nc_refs == 1 no
2712          * other references are possible.  If it isn't, however, we have
2713          * to decrement but also be sure to avoid a 1->0 transition.
2714          */
2715         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2716         KKASSERT(ncp->nc_refs > 0);
2717
2718         /*
2719          * Acquire locks.  Note that the parent can't go away while we hold
2720          * a child locked.
2721          */
2722         nchpp = NULL;
2723         if ((par = ncp->nc_parent) != NULL) {
2724                 if (nonblock) {
2725                         for (;;) {
2726                                 if (_cache_lock_nonblock(par) == 0)
2727                                         break;
                                     /*
                                      * Parent lock not available.  Flag the
                                      * ncp for a deferred zap and try to give
                                      * up our ref.  If the cmpset loses a
                                      * race on nc_refs, retry the lock.
                                      */
2728                                 refs = ncp->nc_refs;
2729                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2730                                 ++numdefered;   /* MP race ok */
2731                                 if (atomic_cmpset_int(&ncp->nc_refs,
2732                                                       refs, refs - 1)) {
2733                                         _cache_unlock(ncp);
2734                                         return(NULL);
2735                                 }
2736                                 cpu_pause();
2737                         }
                             /* parent is locked, now add our ref to it */
2738                         _cache_hold(par);
2739                 } else {
2740                         _cache_hold(par);
2741                         _cache_lock(par);
2742                 }
2743                 nchpp = ncp->nc_head;
2744                 spin_lock(&nchpp->spin);
2745         }
2746
2747         /*
2748          * At this point if we find refs == 1 it should not be possible for
2749          * anyone else to have access to the ncp.  We are holding the only
2750          * possible access point left (nchpp) spin-locked.
2751          *
2752          * If someone other than us has a ref or we have children
2753          * we cannot zap the entry.  The 1->0 transition and any
2754          * further list operation is protected by the spinlocks
2755          * we have acquired but other transitions are not.
2756          */
2757         for (;;) {
2758                 refs = ncp->nc_refs;
2759                 cpu_ccfence();
2760                 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
2761                         break;
                     /*
                      * Cannot destroy.  Drop only our own ref, release the
                      * parent (lock and ref) and leave the ncp in place.
                      */
2762                 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
2763                         if (par) {
2764                                 spin_unlock(&nchpp->spin);
2765                                 _cache_put(par);
2766                         }
2767                         _cache_unlock(ncp);
2768                         return(NULL);
2769                 }
2770                 cpu_pause();
2771         }
2772
2773         /*
2774          * We are the only ref and with the spinlocks held no further
2775          * refs can be acquired by others.
2776          *
2777          * Remove us from the hash list and parent list.  We have to
2778          * drop a ref on the parent's vp if the parent's list becomes
2779          * empty.
2780          */
2781         dropvp = NULL;
2782         if (par) {
2783                 KKASSERT(nchpp == ncp->nc_head);
2784                 LIST_REMOVE(ncp, nc_hash);
2785                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
2786                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
2787                         dropvp = par->nc_vp;
2788                 ncp->nc_head = NULL;
2789                 ncp->nc_parent = NULL;
2790                 spin_unlock(&nchpp->spin);
                     /* par is unlocked here but our ref on it is kept */
2791                 _cache_unlock(par);
2792         } else {
2793                 KKASSERT(ncp->nc_head == NULL);
2794         }
2795
2796         /*
2797          * ncp should not have picked up any refs.  Physically
2798          * destroy the ncp.
2799          */
2800         if (ncp->nc_refs != 1) {
2801                 int save_refs = ncp->nc_refs;
2802                 cpu_ccfence();
2803                 panic("cache_zap: %p bad refs %d (%d)\n",
2804                         ncp, save_refs, atomic_fetchadd_int(&ncp->nc_refs, 0));
2805         }
2806         KKASSERT(ncp->nc_refs == 1);
2807         /* _cache_unlock(ncp) not required */
2808         ncp->nc_refs = -1;      /* safety */
2809         if (ncp->nc_name)
2810                 kfree(ncp->nc_name, M_VFSCACHE);
2811         kfree(ncp, M_VFSCACHE);
2812
2813         /*
2814          * Delayed drop (we had to release our spinlocks)
2815          *
2816          * The refed parent (if not NULL) must be dropped.  The
2817          * caller is responsible for looping.
2818          */
2819         if (dropvp)
2820                 vdrop(dropvp);
2821         return(par);
2822 }
2823
2824 /*
2825  * Clean up dangling negative cache and defered-drop entries in the
2826  * namecache.
2827  *
2828  * This routine is called in the critical path and also called from
2829  * vnlru().  When called from vnlru we use a lower limit to try to
2830  * deal with the negative cache before the critical path has to start
2831  * dealing with it.
2832  */
2833 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2834
2835 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2836 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2837
2838 void
2839 cache_hysteresis(int critpath)
2840 {
2841         int poslimit;
2842         int neglimit = maxvnodes / ncnegfactor;
2843         int xnumcache = numcache;
2844
2845         if (critpath == 0)
2846                 neglimit = neglimit * 8 / 10;
2847
2848         /*
2849          * Don't cache too many negative hits.  We use hysteresis to reduce
2850          * the impact on the critical path.
2851          */
2852         switch(neg_cache_hysteresis_state[critpath]) {
2853         case CHI_LOW:
2854                 if (numneg > MINNEG && numneg > neglimit) {
2855                         if (critpath)
2856                                 _cache_cleanneg(ncnegflush);
2857                         else
2858                                 _cache_cleanneg(ncnegflush +
2859                                                 numneg - neglimit);
2860                         neg_cache_hysteresis_state[critpath] = CHI_HIGH;
2861                 }
2862                 break;
2863         case CHI_HIGH:
2864                 if (numneg > MINNEG * 9 / 10 &&
2865                     numneg * 9 / 10 > neglimit
2866                 ) {
2867                         if (critpath)
2868                                 _cache_cleanneg(ncnegflush);
2869                         else
2870                                 _cache_cleanneg(ncnegflush +
2871                                                 numneg * 9 / 10 - neglimit);
2872                 } else {
2873                         neg_cache_hysteresis_state[critpath] = CHI_LOW;
2874                 }
2875                 break;
2876         }
2877
2878         /*
2879          * Don't cache too many positive hits.  We use hysteresis to reduce
2880          * the impact on the critical path.
2881          *
2882          * Excessive positive hits can accumulate due to large numbers of
2883          * hardlinks (the vnode cache will not prevent hl ncps from growing
2884          * into infinity).
2885          */
2886         if ((poslimit = ncposlimit) == 0)
2887                 poslimit = maxvnodes * 2;
2888         if (critpath == 0)
2889                 poslimit = poslimit * 8 / 10;
2890
2891         switch(pos_cache_hysteresis_state[critpath]) {
2892         case CHI_LOW:
2893                 if (xnumcache > poslimit && xnumcache > MINPOS) {
2894                         if (critpath)
2895                                 _cache_cleanpos(ncposflush);
2896                         else
2897                                 _cache_cleanpos(ncposflush +
2898                                                 xnumcache - poslimit);
2899                         pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2900                 }
2901                 break;
2902         case CHI_HIGH:
2903                 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2904                         if (critpath)
2905                                 _cache_cleanpos(ncposflush);
2906                         else
2907                                 _cache_cleanpos(ncposflush +
2908                                                 xnumcache - poslimit * 5 / 6);
2909                 } else {
2910                         pos_cache_hysteresis_state[critpath] = CHI_LOW;
2911                 }
2912                 break;
2913         }
2914
2915         /*
2916          * Clean out dangling defered-zap ncps which could not
2917          * be cleanly dropped if too many build up.  Note
2918          * that numdefered is not an exact number as such ncps
2919          * can be reused and the counter is not handled in a MP
2920          * safe manner by design.
2921          */
2922         if (numdefered > neglimit) {
2923                 _cache_cleandefered();
2924         }
2925 }
2926
2927 /*
2928  * NEW NAMECACHE LOOKUP API
2929  *
2930  * Lookup an entry in the namecache.  The passed par_nch must be referenced
2931  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
2932  * is ALWAYS returned, even if the supplied component is illegal.
2933  *
2934  * The resulting namecache entry should be returned to the system with
2935  * cache_put() or cache_unlock() + cache_drop().
2936  *
2937  * namecache locks are recursive but care must be taken to avoid lock order
2938  * reversals (hence why the passed par_nch must be unlocked).  Locking
2939  * rules are to order for parent traversals, not for child traversals.
2940  *
2941  * Nobody else will be able to manipulate the associated namespace (e.g.
2942  * create, delete, rename, rename-target) until the caller unlocks the
2943  * entry.
2944  *
2945  * The returned entry will be in one of three states:  positive hit (non-null
2946  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2947  * Unresolved entries must be resolved through the filesystem to associate the
2948  * vnode and/or determine whether a positive or negative hit has occurred.
2949  *
2950  * It is not necessary to lock a directory in order to lock namespace under
2951  * that directory.  In fact, it is explicitly not allowed to do that.  A
2952  * directory is typically only locked when being created, renamed, or
2953  * destroyed.
2954  *
2955  * The directory (par) may be unresolved, in which case any returned child
2956  * will likely also be marked unresolved.  Likely but not guaranteed.  Since
2957  * the filesystem lookup requires a resolved directory vnode the caller is
2958  * responsible for resolving the namecache chain top-down.  This API
2959  * specifically allows whole chains to be created in an unresolved state.
      *
      * par_nch - handle of the parent directory; its mount is copied into
      *           the returned handle.
      * nlc     - name component to look up (nlc_nameptr / nlc_namelen).
      *
      * The returned handle also carries a mount reference obtained with
      * _cache_mntref().
2960  */
2961 struct nchandle
2962 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
2963 {
2964         struct nchandle nch;
2965         struct namecache *ncp;
2966         struct namecache *new_ncp;
2967         struct nchash_head *nchpp;
2968         struct mount *mp;
2969         u_int32_t hash;
2970         globaldata_t gd;
2971         int par_locked;
2972
2973         gd = mycpu;
2974         mp = par_nch->mount;
2975         par_locked = 0;
2976
2977         /*
2978          * This is a good time to call it, no ncp's are locked by
2979          * the caller or us.
2980          */
2981         cache_hysteresis(1);
2982
2983         /*
2984          * Try to locate an existing entry.  The hash covers the name
              * bytes and the parent ncp pointer value.
2985          */
2986         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2987         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2988         new_ncp = NULL;
2989         nchpp = NCHHASH(hash);
2990 restart:
             /*
              * Once new_ncp has been allocated we may have to insert it,
              * so the chain is locked exclusively.  Until then a shared
              * lock is used for the scan.
              */
2991         if (new_ncp)
2992                 spin_lock(&nchpp->spin);
2993         else
2994                 spin_lock_shared(&nchpp->spin);
2995
2996         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2997                 /*
2998                  * Break out if we find a matching entry.  Note that
2999                  * UNRESOLVED entries may match, but DESTROYED entries
3000                  * do not.
3001                  */
3002                 if (ncp->nc_parent == par_nch->ncp &&
3003                     ncp->nc_nlen == nlc->nlc_namelen &&
3004                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3005                     (ncp->nc_flag & NCF_DESTROYED) == 0
3006                 ) {
                             /*
                              * Hold the candidate, then release the chain
                              * spinlock and the parent lock (if we took it)
                              * before trying to lock the candidate.
                              */
3007                         _cache_hold(ncp);
3008                         if (new_ncp)
3009                                 spin_unlock(&nchpp->spin);
3010                         else
3011                                 spin_unlock_shared(&nchpp->spin);
3012                         if (par_locked) {
3013                                 _cache_unlock(par_nch->ncp);
3014                                 par_locked = 0;
3015                         }
3016                         if (_cache_lock_special(ncp) == 0) {
3017                                 /*
3018                                  * Successfully locked but we must re-test
3019                                  * conditions that might have changed since
3020                                  * we did not have the lock before.
3021                                  */
3022                                 if (ncp->nc_parent != par_nch->ncp ||
3023                                     ncp->nc_nlen != nlc->nlc_namelen ||
3024                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3025                                          ncp->nc_nlen) ||
3026                                     (ncp->nc_flag & NCF_DESTROYED)) {
3027                                         _cache_put(ncp);
3028                                         goto restart;
3029                                 }
3030                                 _cache_auto_unresolve(mp, ncp);
                                     /* the preallocated entry is not needed */
3031                                 if (new_ncp)
3032                                         _cache_free(new_ncp);
3033                                 goto found;
3034                         }
3035                         _cache_get(ncp);        /* cycle the lock to block */
3036                         _cache_put(ncp);
3037                         _cache_drop(ncp);
3038                         goto restart;
3039                 }
3040         }
3041
3042         /*
3043          * We failed to locate an entry, create a new entry and add it to
3044          * the cache.  The parent ncp must also be locked so we
3045          * can link into it.
3046          *
3047          * We have to relookup after possibly blocking in kmalloc or
3048          * when locking par_nch.
3049          *
3050          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3051          *       mount case, in which case nc_name will be NULL.
3052          */
3053         if (new_ncp == NULL) {
3054                 spin_unlock_shared(&nchpp->spin);
3055                 new_ncp = cache_alloc(nlc->nlc_namelen);
3056                 if (nlc->nlc_namelen) {
3057                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3058                               nlc->nlc_namelen);
3059                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3060                 }
3061                 goto restart;
3062         }
3063
3064         /*
3065          * NOTE! The spinlock is held exclusively here because new_ncp
3066          *       is non-NULL.
3067          */
3068         if (par_locked == 0) {
3069                 spin_unlock(&nchpp->spin);
3070                 _cache_lock(par_nch->ncp);
3071                 par_locked = 1;
3072                 goto restart;
3073         }
3074
3075         /*
3076          * WARNING!  We still hold the spinlock.  We have to set the hash
3077          *           table entry atomically.
3078          */
3079         ncp = new_ncp;
3080         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3081         spin_unlock(&nchpp->spin);
3082         _cache_unlock(par_nch->ncp);
3083         /* par_locked = 0 - not used */
3084 found:
3085         /*
3086          * stats and namecache size management
3087          */
3088         if (ncp->nc_flag & NCF_UNRESOLVED)
3089                 ++gd->gd_nchstats->ncs_miss;
3090         else if (ncp->nc_vp)
3091                 ++gd->gd_nchstats->ncs_goodhits;
3092         else
3093                 ++gd->gd_nchstats->ncs_neghits;
3094         nch.mount = mp;
3095         nch.ncp = ncp;
3096         _cache_mntref(nch.mount);
3097
3098         return(nch);
3099 }
3100
3101 /*
3102  * Attempt to lookup a namecache entry and return with a shared namecache
3103  * lock.
      *
      * par_nch - parent directory handle; its mount is used for the result
      * nlc     - name component to look up under par_nch
      * excl    - non-zero if the caller wants an exclusive lock, which this
      *           function never provides
      * res_nch - filled in only on success
      *
      * Returns EWOULDBLOCK if excl is set, if ncp_shared_lock_disable is set,
      * or if no matching entry could be locked shared and validated as
      * resolved.  This function never creates a new entry.
      *
      * On success res_nch holds {mp, ncp} with the mount referenced via
      * _cache_mntref() and the ncp still held and locked from the scan,
      * and ncp->nc_error is returned.
3104  */
3105 int
3106 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc,
3107                            int excl, struct nchandle *res_nch)
3108 {
3109         struct namecache *ncp;
3110         struct nchash_head *nchpp;
3111         struct mount *mp;
3112         u_int32_t hash;
3113         globaldata_t gd;
3114
3115         /*
3116          * If exclusive requested or shared namecache locks are disabled,
3117          * return failure.
3118          */
3119         if (ncp_shared_lock_disable || excl)
3120                 return(EWOULDBLOCK);
3121
3122         gd = mycpu;
3123         mp = par_nch->mount;
3124
3125         /*
3126          * This is a good time to call it, no ncp's are locked by
3127          * the caller or us.
3128          */
3129         cache_hysteresis(1);
3130
3131         /*
3132          * Try to locate an existing entry
               *
               * The hash covers the name bytes followed by the parent ncp
               * pointer value.
3133          */
3134         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3135         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3136         nchpp = NCHHASH(hash);
3137
3138         spin_lock_shared(&nchpp->spin);
3139
3140         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
3141                 /*
3142                  * Break out if we find a matching entry.  Note that
3143                  * UNRESOLVED entries may match, but DESTROYED entries
3144                  * do not.
                       *
                       * The ncp is held before the chain spinlock is
                       * released, and the match is re-verified once the
                       * ncp lock has been acquired.  An entry that turns
                       * out to be UNRESOLVED, or for which
                       * _cache_auto_unresolve_test() returns non-zero, is
                       * rejected rather than resolved here.
3145                  */
3146                 if (ncp->nc_parent == par_nch->ncp &&
3147                     ncp->nc_nlen == nlc->nlc_namelen &&
3148                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3149                     (ncp->nc_flag & NCF_DESTROYED) == 0
3150                 ) {
3151                         _cache_hold(ncp);
3152                         spin_unlock_shared(&nchpp->spin);
3153                         if (_cache_lock_shared_special(ncp) == 0) {
3154                                 if (ncp->nc_parent == par_nch->ncp &&
3155                                     ncp->nc_nlen == nlc->nlc_namelen &&
3156                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3157                                          ncp->nc_nlen) == 0 &&
3158                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
3159                                     (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
3160                                     _cache_auto_unresolve_test(mp, ncp) == 0) {
3161                                         goto found;
3162                                 }
3163                                 _cache_unlock(ncp);
3164                         }
                             /*
                              * Only the first matching entry is tried.  The
                              * spinlock is re-acquired only because the
                              * failure exit below releases it.
                              */
3165                         _cache_drop(ncp);
3166                         spin_lock_shared(&nchpp->spin);
3167                         break;
3168                 }
3169         }
3170
3171         /*
3172          * Failure
3173          */
3174         spin_unlock_shared(&nchpp->spin);
3175         return(EWOULDBLOCK);
3176
3177         /*
3178          * Success
3179          *
3180          * Note that nc_error might be non-zero (e.g ENOENT).
               *
               * The hit is always counted in ncs_goodhits, whether or not
               * nc_vp is set.
3181          */
3182 found:
3183         res_nch->mount = mp;
3184         res_nch->ncp = ncp;
3185         ++gd->gd_nchstats->ncs_goodhits;
3186         _cache_mntref(res_nch->mount);
3187
3188         KKASSERT(ncp->nc_error != EWOULDBLOCK);
3189         return(ncp->nc_error);
3190 }
3191
3192 /*
3193  * This is a non-blocking version of cache_nlookup() used by
3194  * nfs_readdirplusrpc_uio().  It can fail for any reason and
3195  * will return nch.ncp == NULL in that case.
      *
      * par_nch - parent directory handle; its mount is used for the result
      * nlc     - name component to look up (or create) under par_nch
      *
      * Neither an existing entry nor the parent is waited for: the lookup
      * fails if _cache_lock_special() on a matching entry or
      * _cache_lock_nonblock() on the parent returns non-zero.
      *
      * On success nch.mount is referenced via _cache_mntref() and nch.ncp
      * is either the existing entry (held and locked here) or a newly
      * linked entry from cache_alloc().
      * NOTE(review): presumably cache_alloc() returns the new entry
      * referenced and locked so both paths look the same to the
      * caller - confirm.
      *
      * On failure nch.mount and nch.ncp are both NULL and any preallocated
      * entry has been freed.
3196  */
3197 struct nchandle
3198 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
3199 {
3200         struct nchandle nch;
3201         struct namecache *ncp;
3202         struct namecache *new_ncp;
3203         struct nchash_head *nchpp;
3204         struct mount *mp;
3205         u_int32_t hash;
3206         globaldata_t gd;
3207         int par_locked;
3208
3209         gd = mycpu;
3210         mp = par_nch->mount;
3211         par_locked = 0;
3212
3213         /*
3214          * Try to locate an existing entry
               *
               * We come back to restart: after allocating new_ncp and again
               * after locking the parent, since the spinlock is dropped in
               * both cases and the chain must be rescanned.
3215          */
3216         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3217         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3218         new_ncp = NULL;
3219         nchpp = NCHHASH(hash);
3220 restart:
3221         spin_lock(&nchpp->spin);
3222         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
3223                 /*
3224                  * Break out if we find a matching entry.  Note that
3225                  * UNRESOLVED entries may match, but DESTROYED entries
3226                  * do not.
                       *
                       * The parent lock is only needed for linking a new
                       * entry, so it is released before the existing
                       * entry is locked.
3227                  */
3228                 if (ncp->nc_parent == par_nch->ncp &&
3229                     ncp->nc_nlen == nlc->nlc_namelen &&
3230                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3231                     (ncp->nc_flag & NCF_DESTROYED) == 0
3232                 ) {
3233                         _cache_hold(ncp);
3234                         spin_unlock(&nchpp->spin);
3235                         if (par_locked) {
3236                                 _cache_unlock(par_nch->ncp);
3237                                 par_locked = 0;
3238                         }
3239                         if (_cache_lock_special(ncp) == 0) {
                                     /*
                                      * Re-verify the match now that the
                                      * ncp is locked; a mismatch is logged
                                      * and treated as a failure.
                                      * NOTE(review): the message prefix
                                      * reads cache_lookup_nonblock, not
                                      * cache_nlookup_nonblock.
                                      */
3240                                 if (ncp->nc_parent != par_nch->ncp ||
3241                                     ncp->nc_nlen != nlc->nlc_namelen ||
3242                                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
3243                                     (ncp->nc_flag & NCF_DESTROYED)) {
3244                                         kprintf("cache_lookup_nonblock: "
3245                                                 "ncp-race %p %*.*s\n",
3246                                                 ncp,
3247                                                 nlc->nlc_namelen,
3248                                                 nlc->nlc_namelen,
3249                                                 nlc->nlc_nameptr);
3250                                         _cache_unlock(ncp);
3251                                         _cache_drop(ncp);
3252                                         goto failed;
3253                                 }
3254                                 _cache_auto_unresolve(mp, ncp);
3255                                 if (new_ncp) {
3256                                         _cache_free(new_ncp);
3257                                         new_ncp = NULL;
3258                                 }
3259                                 goto found;
3260                         }
3261                         _cache_drop(ncp);
3262                         goto failed;
3263                 }
3264         }
3265
3266         /*
3267          * We failed to locate an entry, create a new entry and add it to
3268          * the cache.  The parent ncp must also be locked so we
3269          * can link into it.
3270          *
3271          * We have to relookup after possibly blocking in kmalloc or
3272          * when locking par_nch.
3273          *
3274          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3275          *       mount case, in which case nc_name will be NULL.
3276          */
3277         if (new_ncp == NULL) {
3278                 spin_unlock(&nchpp->spin);
3279                 new_ncp = cache_alloc(nlc->nlc_namelen);
3280                 if (nlc->nlc_namelen) {
3281                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3282                               nlc->nlc_namelen);
3283                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3284                 }
3285                 goto restart;
3286         }
             /*
              * Try to lock the parent without waiting.  If that fails the
              * whole lookup fails; if it succeeds the chain is rescanned
              * because the spinlock was dropped.
              */
3287         if (par_locked == 0) {
3288                 spin_unlock(&nchpp->spin);
3289                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3290                         par_locked = 1;
3291                         goto restart;
3292                 }
3293                 goto failed;
3294         }
3295
3296         /*
3297          * WARNING!  We still hold the spinlock.  We have to set the hash
3298          *           table entry atomically.
3299          */
3300         ncp = new_ncp;
3301         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3302         spin_unlock(&nchpp->spin);
3303         _cache_unlock(par_nch->ncp);
3304         /* par_locked = 0 - not used */
3305 found:
3306         /*
3307          * stats and namecache size management
3308          */
3309         if (ncp->nc_flag & NCF_UNRESOLVED)
3310                 ++gd->gd_nchstats->ncs_miss;
3311         else if (ncp->nc_vp)
3312                 ++gd->gd_nchstats->ncs_goodhits;
3313         else
3314                 ++gd->gd_nchstats->ncs_neghits;
3315         nch.mount = mp;
3316         nch.ncp = ncp;
3317         _cache_mntref(nch.mount);
3318
3319         return(nch);
3320 failed:
             /*
              * Every path reaching here has already released the spinlock
              * and the parent lock; only the preallocated entry is left to
              * clean up.
              */
3321         if (new_ncp) {
3322                 _cache_free(new_ncp);
3323                 new_ncp = NULL;
3324         }
3325         nch.mount = NULL;
3326         nch.ncp = NULL;
3327         return(nch);
3328 }
3329
3330 /*
3331  * The namecache entry is marked as being used as a mount point.
3332  * Locate the mount if it is visible to the caller.  The DragonFly
3333  * mount system allows arbitrary loops in the topology and disentangles
3334  * those loops by matching against (mp, ncp) rather than just (ncp).
3335  * This means any given ncp can dive any number of mounts, depending
3336  * on the relative mount (e.g. nullfs) the caller is at in the topology.
3337  *
3338  * We use a very simple frontend cache to reduce SMP conflicts,
3339  * which we have to do because the mountlist scan needs an exclusive
3340  * lock around its ripout info list.  Not to mention that there might
3341  * be a lot of mounts.
      *
      * struct findmount_info carries the search key and the result between
      * cache_findmount() and cache_findmount_callback() across the
      * mountlist_scan() call.
3342  */
3343 struct findmount_info {
3344         struct mount *result;      /* matching mount (referenced by the callback), NULL if none */
3345         struct mount *nch_mount;   /* key: compared against mnt_ncmounton.mount */
3346         struct namecache *nch_ncp; /* key: compared against mnt_ncmounton.ncp */
3347 };
3348
3349 static
3350 struct ncmount_cache *
3351 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3352 {
3353         int hash;
3354
3355         hash = ((int)(intptr_t)mp / sizeof(*mp)) ^
3356                ((int)(intptr_t)ncp / sizeof(*ncp));
3357         hash = (hash & 0x7FFFFFFF) % NCMOUNT_NUMCACHE;
3358         return (&ncmount_cache[hash]);
3359 }
3360
3361 static
3362 int
3363 cache_findmount_callback(struct mount *mp, void *data)
3364 {
3365         struct findmount_info *info = data;
3366
3367         /*
3368          * Check the mount's mounted-on point against the passed nch.
3369          */
3370         if (mp->mnt_ncmounton.mount == info->nch_mount &&
3371             mp->mnt_ncmounton.ncp == info->nch_ncp
3372         ) {
3373             info->result = mp;
3374             _cache_mntref(mp);
3375             return(-1);
3376         }
3377         return(0);
3378 }
3379
/*
 * Locate the mount whose mounted-on point (mnt_ncmounton) equals *nch.
 *
 * Unless ncmount_cache_enable is zero, the frontend cache slot for
 * {nch->mount, nch->ncp} is consulted first.  On a miss the mount list
 * is scanned with cache_findmount_callback() and the slot is updated
 * with the outcome.
 *
 * Returns the matching mount with a reference added by _cache_mntref(),
 * or NULL if no mount matches.
 */
3380 struct mount *
3381 cache_findmount(struct nchandle *nch)
3382 {
3383         struct findmount_info info;
3384         struct ncmount_cache *ncc;
3385         struct mount *mp;
3386
3387         /*
3388          * Fast
               *
               * The first ncc->ncp comparison is an unlocked peek; the slot
               * is checked again under the shared spinlock before use.
3389          */
3390         if (ncmount_cache_enable == 0) {
3391                 ncc = NULL;
3392                 goto skip;
3393         }
3394         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3395         if (ncc->ncp == nch->ncp) {
3396                 spin_lock_shared(&ncc->spin);
3397                 if (ncc->isneg == 0 &&
3398                     ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) {
3399                         if (mp->mnt_ncmounton.mount == nch->mount &&
3400                             mp->mnt_ncmounton.ncp == nch->ncp) {
3401                                 /*
3402                                  * Cache hit (positive)
3403                                  */
3404                                 _cache_mntref(mp);
3405                                 spin_unlock_shared(&ncc->spin);
3406                                 ++ncmount_cache_hit;
3407                                 return(mp);
3408                         }
3409                         /* else cache miss */
3410                 }
3411                 if (ncc->isneg &&
3412                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3413                         /*
3414                          * Cache hit (negative)
3415                          */
3416                         spin_unlock_shared(&ncc->spin);
3417                         ++ncmount_cache_hit;
3418                         return(NULL);
3419                 }
3420                 spin_unlock_shared(&ncc->spin);
3421         }
3422 skip:
3423
3424         /*
3425          * Slow
               *
               * On a match the callback stores the mount in info.result
               * with a reference already taken for our caller.
3426          */
3427         info.result = NULL;
3428         info.nch_mount = nch->mount;
3429         info.nch_ncp = nch->ncp;
3430         mountlist_scan(cache_findmount_callback, &info,
3431                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
3432
3433         /*
3434          * Cache the result.
3435          *
3436          * Negative lookups: We cache the originating {ncp,mp}. (mp) is
3437          *                   only used for pointer comparisons and is not
3438          *                   referenced (otherwise there would be dangling
3439          *                   refs).
3440          *
3441          * Positive lookups: We cache the originating {ncp} and the target
3442          *                   (mp).  (mp) is referenced.
3443          *
3444          * Indeterminate:    If the match is undergoing an unmount we do
3445          *                   not cache it to avoid racing cache_unmounting(),
3446          *                   but still return the match.
               *
               * In both caching cases the reference held by a previous
               * positive entry in the slot is released first.  The positive
               * case takes a second reference for the slot, separate from
               * the one returned to the caller.  ncmount_cache_miss is only
               * counted when the frontend cache is enabled.
3447          */
3448         if (ncc) {
3449                 spin_lock(&ncc->spin);
3450                 if (info.result == NULL) {
3451                         if (ncc->isneg == 0 && ncc->mp)
3452                                 _cache_mntrel(ncc->mp);
3453                         ncc->ncp = nch->ncp;
3454                         ncc->mp = nch->mount;
3455                         ncc->isneg = 1;
3456                         spin_unlock(&ncc->spin);
3457                         ++ncmount_cache_overwrite;
3458                 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) {
3459                         if (ncc->isneg == 0 && ncc->mp)
3460                                 _cache_mntrel(ncc->mp);
3461                         _cache_mntref(info.result);
3462                         ncc->ncp = nch->ncp;
3463                         ncc->mp = info.result;
3464                         ncc->isneg = 0;
3465                         spin_unlock(&ncc->spin);
3466                         ++ncmount_cache_overwrite;
3467                 } else {
3468                         spin_unlock(&ncc->spin);
3469                 }
3470                 ++ncmount_cache_miss;
3471         }
3472         return(info.result);
3473 }
3474
/*
 * Release a mount reference by calling _cache_mntrel(mp).
 *
 * NOTE(review): presumably the counterpart to the referenced mount
 * returned by cache_findmount() - confirm against callers.
 */
3475 void
3476 cache_dropmount(struct mount *mp)
3477 {
3478         _cache_mntrel(mp);
3479 }
3480
3481 void
3482 cache_ismounting(struct mount *mp)
3483 {
3484         struct nchandle *nch = &mp->mnt_ncmounton;
3485         struct ncmount_cache *ncc;
3486
3487         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3488         if (ncc->isneg &&
3489             ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3490                 spin_lock(&ncc->spin);
3491                 if (ncc->isneg &&
3492                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3493                         ncc->ncp = NULL;
3494                         ncc->mp = NULL;
3495                 }
3496                 spin_unlock(&ncc->spin);
3497         }
3498 }
3499
3500 void
3501 cache_unmounting(struct mount *mp)
3502 {
3503         struct nchandle *nch = &mp->mnt_ncmounton;
3504         struct ncmount_cache *ncc;
3505
3506         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3507         if (ncc->isneg == 0 &&
3508             ncc->ncp == nch->ncp && ncc->mp == mp) {
3509                 spin_lock(&ncc->spin);
3510                 if (ncc->isneg == 0 &&
3511                     ncc->ncp == nch->ncp && ncc->mp == mp) {
3512                         _cache_mntrel(mp);
3513                         ncc->ncp = NULL;
3514                         ncc->mp = NULL;
3515                 }
3516                 spin_unlock(&ncc->spin);
3517         }
3518 }
3519
3520 /*
3521  * Resolve an unresolved namecache entry, generally by looking it up.
3522  * The passed ncp must be locked and refd.
      * The lock must be exclusive (asserted below).
3523  *
3524  * Theoretically since a vnode cannot be recycled while held, and since
3525  * the nc_parent chain holds its vnode as long as children exist, the
3526  * direct parent of the cache entry we are trying to resolve should
3527  * have a valid vnode.  If not then generate an error that we can
3528  * determine is related to a resolver bug.
3529  *
3530  * However, if a vnode was in the middle of a recyclement when the NCP
3531  * got locked, ncp->nc_vp might point to a vnode that is about to become
3532  * invalid.  cache_resolve() handles this case by unresolving the entry
3533  * and then re-resolving it.
3534  *
3535  * Note that successful resolution does not necessarily return an error
3536  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
3537  * will be returned.
      *
      * nch  - handle {mount, ncp} of the entry to resolve
      * cred - credentials passed through to VOP_NRESOLVE()
      *
      * Returns ncp->nc_error in the normal case.  EINVAL is returned for a
      * DESTROYED ncp, ENOENT when the parent is DESTROYED, EXDEV when the
      * parent chain is broken, or the error of a parent that failed to
      * resolve.  EAGAIN from VOP_NRESOLVE() is never returned; it causes a
      * retry instead.
3538  */
3539 int
3540 cache_resolve(struct nchandle *nch, struct ucred *cred)
3541 {
3542         struct namecache *par_tmp;
3543         struct namecache *par;
3544         struct namecache *ncp;
3545         struct nchandle nctmp;
3546         struct mount *mp;
3547         struct vnode *dvp;
3548         int error;
3549
3550         ncp = nch->ncp;
3551         mp = nch->mount;
3552         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
3553 restart:
3554         /*
3555          * If the ncp is already resolved we have nothing to do.  However,
3556          * we do want to guarantee that a usable vnode is returned when
3557          * a vnode is present, so make sure it hasn't been reclaimed.
3558          */
3559         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3560                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3561                         _cache_setunresolved(ncp);
3562                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
3563                         return (ncp->nc_error);
3564         }
3565
3566         /*
3567          * If the ncp was destroyed it will never resolve again.  This
3568          * can basically only happen when someone is chdir'd into an
3569          * empty directory which is then rmdir'd.  We want to catch this
3570          * here and not dive the VFS because the VFS might actually
3571          * have a way to re-resolve the disconnected ncp, which will
3572          * result in inconsistencies in the cdir/nch for proc->p_fd.
3573          */
3574         if (ncp->nc_flag & NCF_DESTROYED)
3575                 return(EINVAL);
3576
3577         /*
3578          * Mount points need special handling because the parent does not
3579          * belong to the same filesystem as the ncp.
3580          */
3581         if (ncp == mp->mnt_ncmountpt.ncp)
3582                 return (cache_resolve_mp(mp));
3583
3584         /*
3585          * We expect an unbroken chain of ncps to at least the mount point,
3586          * and even all the way to root (but this code doesn't have to go
3587          * past the mount point).
3588          */
3589         if (ncp->nc_parent == NULL) {
3590                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3591                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3592                 ncp->nc_error = EXDEV;
3593                 return(ncp->nc_error);
3594         }
3595
3596         /*
3597          * The vp's of the parent directories in the chain are held via vhold()
3598          * due to the existence of the child, and should not disappear.
3599          * However, there are cases where they can disappear:
3600          *
3601          *      - due to filesystem I/O errors.
3602          *      - due to NFS being stupid about tracking the namespace and
3603          *        destroys the namespace for entire directories quite often.
3604          *      - due to forced unmounts.
3605          *      - due to an rmdir (parent will be marked DESTROYED)
3606          *
3607          * When this occurs we have to track the chain backwards and resolve
3608          * it, looping until the resolver catches up to the current node.  We
3609          * could recurse here but we might run ourselves out of kernel stack
3610          * so we do it in a more painful manner.  This situation really should
3611          * not occur all that often, or if it does not have to go back too
3612          * many nodes to resolve the ncp.
               *
               * Each pass of the loop below walks up from ncp's parent to the
               * highest ancestor whose own parent still has a vnode, resolves
               * that one ancestor, and then retries cache_dvpref(ncp).
3613          */
3614         while ((dvp = cache_dvpref(ncp)) == NULL) {
3615                 /*
3616                  * This case can occur if a process is CD'd into a
3617                  * directory which is then rmdir'd.  If the parent is marked
3618                  * destroyed there is no point trying to resolve it.
3619                  */
3620                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3621                         return(ENOENT);
3622                 par = ncp->nc_parent;
3623                 _cache_hold(par);
3624                 _cache_lock(par);
                     /*
                      * Climb while the next parent has no vnode, locking
                      * the parent before releasing the child.
                      */
3625                 while ((par_tmp = par->nc_parent) != NULL &&
3626                        par_tmp->nc_vp == NULL) {
3627                         _cache_hold(par_tmp);
3628                         _cache_lock(par_tmp);
3629                         _cache_put(par);
3630                         par = par_tmp;
3631                 }
3632                 if (par->nc_parent == NULL) {
3633                         kprintf("EXDEV case 2 %*.*s\n",
3634                                 par->nc_nlen, par->nc_nlen, par->nc_name);
3635                         _cache_put(par);
3636                         return (EXDEV);
3637                 }
3638                 /*
3639                  * The parent is not set in stone, ref and lock it to prevent
3640                  * it from disappearing.  Also note that due to renames it
3641                  * is possible for our ncp to move and for par to no longer
3642                  * be one of its parents.  We resolve it anyway, the loop
3643                  * will handle any moves.
3644                  */
3645                 _cache_get(par);        /* additional hold/lock */
3646                 _cache_put(par);        /* from earlier hold/lock */
3647                 if (par == nch->mount->mnt_ncmountpt.ncp) {
3648                         cache_resolve_mp(nch->mount);
3649                 } else if ((dvp = cache_dvpref(par)) == NULL) {
3650                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
3651                         _cache_put(par);
3652                         continue;
3653                 } else {
3654                         if (par->nc_flag & NCF_UNRESOLVED) {
3655                                 nctmp.mount = mp;
3656                                 nctmp.ncp = par;
3657                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3658                         }
3659                         vrele(dvp);
3660                 }
                     /*
                      * Any error other than EAGAIN on the ancestor aborts
                      * the resolve; EAGAIN is only logged and the outer
                      * loop retries.
                      */
3661                 if ((error = par->nc_error) != 0) {
3662                         if (par->nc_error != EAGAIN) {
3663                                 kprintf("EXDEV case 3 %*.*s error %d\n",
3664                                     par->nc_nlen, par->nc_nlen, par->nc_name,
3665                                     par->nc_error);
3666                                 _cache_put(par);
3667                                 return(error);
3668                         }
3669                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3670                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
3671                 }
3672                 _cache_put(par);
3673                 /* loop */
3674         }
3675
3676         /*
3677          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3678          * ncp's and reattach them.  If this occurs the original ncp is marked
3679          * EAGAIN to force a relookup.
3680          *
3681          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3682          * ncp must already be resolved.
               *
               * NOTE(review): the loop above only exits once cache_dvpref(ncp)
               * returns non-NULL, so the EPERM branch below appears to be
               * unreachable.
3683          */
3684         if (dvp) {
3685                 nctmp.mount = mp;
3686                 nctmp.ncp = ncp;
3687                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3688                 vrele(dvp);
3689         } else {
3690                 ncp->nc_error = EPERM;
3691         }
3692         if (ncp->nc_error == EAGAIN) {
3693                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3694                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3695                 goto restart;
3696         }
3697         return(ncp->nc_error);
3698 }
3699
3700 /*
3701  * Resolve the ncp associated with a mount point.  Such ncp's almost always
3702  * remain resolved and this routine is rarely called.  NFS MPs tends to force
3703  * re-resolution more often due to its mac-truck-smash-the-namecache
3704  * method of tracking namespace changes.
3705  *
3706  * The semantics for this call is that the passed ncp must be locked on
3707  * entry and will be locked on return.  However, if we actually have to
3708  * resolve the mount point we temporarily unlock the entry in order to
3709  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
3710  * the unlock we have to recheck the flags after we relock.
3711  */
3712 static int
3713 cache_resolve_mp(struct mount *mp)
3714 {
3715         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
3716         struct vnode *vp;
3717         int error;
3718
3719         KKASSERT(mp != NULL);
3720
3721         /*
3722          * If the ncp is already resolved we have nothing to do.  However,
3723          * we do want to guarentee that a usable vnode is returned when
3724          * a vnode is present, so make sure it hasn't been reclaimed.
3725          */
3726         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3727                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3728                         _cache_setunresolved(ncp);
3729         }
3730
3731         if (ncp->nc_flag & NCF_UNRESOLVED) {
3732                 _cache_unlock(ncp);
3733                 while (vfs_busy(mp, 0))
3734                         ;
3735                 error = VFS_ROOT(mp, &vp);
3736                 _cache_lock(ncp);
3737
3738                 /*
3739                  * recheck the ncp state after relocking.
3740                  */
3741                 if (ncp->nc_flag & NCF_UNRESOLVED) {
3742                         ncp->nc_error = error;
3743                         if (error == 0) {
3744                                 _cache_setvp(mp, ncp, vp);
3745                                 vput(vp);
3746                         } else {
3747                                 kprintf("[diagnostic] cache_resolve_mp: failed"
3748                                         " to resolve mount %p err=%d ncp=%p\n",
3749                                         mp, error, ncp);
3750                                 _cache_setvp(mp, ncp, NULL);
3751                         }
3752                 } else if (error == 0) {
3753                         vput(vp);
3754                 }
3755                 vfs_unbusy(mp);
3756         }
3757         return(ncp->nc_error);
3758 }
3759
3760 /*
3761  * Clean out negative cache entries when too many have accumulated.
3762  */
3763 static void
3764 _cache_cleanneg(int count)
3765 {
3766         struct namecache *ncp;
3767
3768         /*
3769          * Attempt to clean out the specified number of negative cache
3770          * entries.
3771          */
3772         while (count) {
3773                 spin_lock(&ncspin);
3774                 ncp = TAILQ_FIRST(&ncneglist);
3775                 if (ncp == NULL) {
3776                         spin_unlock(&ncspin);
3777                         break;
3778                 }
3779                 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
3780                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
3781                 _cache_hold(ncp);
3782                 spin_unlock(&ncspin);
3783
3784                 /*
3785                  * This can race, so we must re-check that the ncp
3786                  * is on the ncneglist after successfully locking it.
3787                  */
3788                 if (_cache_lock_special(ncp) == 0) {
3789                         if (ncp->nc_vp == NULL &&
3790                             (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3791                                 ncp = cache_zap(ncp, 1);
3792                                 if (ncp)
3793                                         _cache_drop(ncp);
3794                         } else {
3795                                 kprintf("cache_cleanneg: race avoided\n");
3796                                 _cache_unlock(ncp);
3797                         }
3798                 } else {
3799                         _cache_drop(ncp);
3800                 }
3801                 --count;
3802         }
3803 }
3804
3805 /*
3806  * Clean out positive cache entries when too many have accumulated.
3807  */
3808 static void
3809 _cache_cleanpos(int count)
3810 {
3811         static volatile int rover;
3812         struct nchash_head *nchpp;
3813         struct namecache *ncp;
3814         int rover_copy;
3815
3816         /*
3817          * Attempt to clean out the specified number of negative cache
3818          * entries.
3819          */
3820         while (count) {
3821                 rover_copy = ++rover;   /* MPSAFEENOUGH */
3822                 cpu_ccfence();
3823                 nchpp = NCHHASH(rover_copy);
3824
3825                 spin_lock_shared(&nchpp->spin);
3826                 ncp = LIST_FIRST(&nchpp->list);
3827                 while (ncp && (ncp->nc_flag & NCF_DESTROYED))
3828                         ncp = LIST_NEXT(ncp, nc_hash);
3829                 if (ncp)
3830                         _cache_hold(ncp);
3831                 spin_unlock_shared(&nchpp->spin);
3832
3833                 if (ncp) {
3834                         if (_cache_lock_special(ncp) == 0) {
3835                                 ncp = cache_zap(ncp, 1);
3836                                 if (ncp)
3837                                         _cache_drop(ncp);
3838                         } else {
3839                                 _cache_drop(ncp);
3840                         }
3841                 }
3842                 --count;
3843         }
3844 }
3845
3846 /*
3847  * This is a kitchen sink function to clean out ncps which we
3848  * tried to zap from cache_drop() but failed because we were
3849  * unable to acquire the parent lock.
3850  *
3851  * Such entries can also be removed via cache_inval_vp(), such
3852  * as when unmounting.
      *
      * Every hash chain is walked with an on-stack marker entry (dummy)
      * that records the scan position, so the chain spinlock can be
      * released while an individual ncp is processed.  The marker is
      * flagged NCF_DESTROYED, the flag the name lookup scans in this file
      * skip.
      *
      * For each NCF_DEFEREDZAP entry that can be locked without blocking,
      * the flag is cleared and the entry is unlocked and dropped.
      * NOTE(review): presumably the final _cache_drop() is what retries the
      * zap - confirm against _cache_drop().
3853  */
3854 static void
3855 _cache_cleandefered(void)
3856 {
3857         struct nchash_head *nchpp;
3858         struct namecache *ncp;
3859         struct namecache dummy;
3860         int i;
3861
3862         numdefered = 0;
3863         bzero(&dummy, sizeof(dummy));
3864         dummy.nc_flag = NCF_DESTROYED;
3865         dummy.nc_refs = 1;
3866
             /*
              * NOTE(review): the bound is inclusive, matching the
              * initialization loop in nchinit(); nchash looks like the
              * highest valid index (mask) rather than the table size -
              * confirm against hashinit_ext().
              */
3867         for (i = 0; i <= nchash; ++i) {
3868                 nchpp = &nchashtbl[i];
3869
3870                 spin_lock(&nchpp->spin);
3871                 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
3872                 ncp = &dummy;
3873                 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
3874                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
3875                                 continue;
                             /*
                              * Move the marker to just after this ncp
                              * before dropping the spinlock, so the scan
                              * can resume from the marker afterwards.
                              */
3876                         LIST_REMOVE(&dummy, nc_hash);
3877                         LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
3878                         _cache_hold(ncp);
3879                         spin_unlock(&nchpp->spin);
3880                         if (_cache_lock_nonblock(ncp) == 0) {
3881                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
3882                                 _cache_unlock(ncp);
3883                         }
3884                         _cache_drop(ncp);
3885                         spin_lock(&nchpp->spin);
3886                         ncp = &dummy;
3887                 }
3888                 LIST_REMOVE(&dummy, nc_hash);
3889                 spin_unlock(&nchpp->spin);
3890         }
3891 }
3892
3893 /*
3894  * Name cache initialization, from vfsinit() when we are booting
3895  */
3896 void
3897 nchinit(void)
3898 {
3899         int i;
3900         globaldata_t gd;
3901
3902         /*
3903          * Initialise per-cpu namecache effectiveness statistics.
3904          */
3905         for (i = 0; i < ncpus; ++i) {
3906                 gd = globaldata_find(i);
3907                 gd->gd_nchstats = &nchstats[i];
3908         }
3909
3910         /*
3911          * Create a generous namecache hash table
3912          */
3913         TAILQ_INIT(&ncneglist);
3914         spin_init(&ncspin, "nchinit");
3915         nchashtbl = hashinit_ext(vfs_inodehashsize(),
3916                                  sizeof(struct nchash_head),
3917                                  M_VFSCACHE, &nchash);
3918         for (i = 0; i <= (int)nchash; ++i) {
3919                 LIST_INIT(&nchashtbl[i].list);
3920                 spin_init(&nchashtbl[i].spin, "nchinit_hash");
3921         }
3922         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
3923                 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
3924         nclockwarn = 5 * hz;
3925 }
3926
3927 /*
3928  * Called from start_init() to bootstrap the root filesystem.  Returns
3929  * a referenced, unlocked namecache record.
3930  */
3931 void
3932 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
3933 {
3934         nch->ncp = cache_alloc(0);
3935         nch->mount = mp;
3936         _cache_mntref(mp);
3937         if (vp)
3938                 _cache_setvp(nch->mount, nch->ncp, vp);
3939 }
3940
3941 /*
3942  * vfs_cache_setroot()
3943  *
3944  *      Create an association between the root of our namecache and
3945  *      the root vnode.  This routine may be called several times during
3946  *      booting.
3947  *
3948  *      If the caller intends to save the returned namecache pointer somewhere
3949  *      it must cache_hold() it.
3950  */
3951 void
3952 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
3953 {
3954         struct vnode *ovp;
3955         struct nchandle onch;
3956
3957         ovp = rootvnode;
3958         onch = rootnch;
3959         rootvnode = nvp;
3960         if (nch)
3961                 rootnch = *nch;
3962         else
3963                 cache_zero(&rootnch);
3964         if (ovp)
3965                 vrele(ovp);
3966         if (onch.ncp)
3967                 cache_drop(&onch);
3968 }
3969
3970 /*
3971  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
3972  * topology and is being removed as quickly as possible.  The new VOP_N*()
3973  * API calls are required to make specific adjustments using the supplied
3974  * ncp pointers rather then just bogusly purging random vnodes.
3975  *
3976  * Invalidate all namecache entries to a particular vnode as well as
3977  * any direct children of that vnode in the namecache.  This is a
3978  * 'catch all' purge used by filesystems that do not know any better.
3979  *
3980  * Note that the linkage between the vnode and its namecache entries will
3981  * be removed, but the namecache entries themselves might stay put due to
3982  * active references from elsewhere in the system or due to the existance of
3983  * the children.   The namecache topology is left intact even if we do not
3984  * know what the vnode association is.  Such entries will be marked
3985  * NCF_UNRESOLVED.
3986  */
3987 void
3988 cache_purge(struct vnode *vp)
3989 {
3990         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
3991 }
3992
3993 static int disablecwd;
3994 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
3995     "Disable getcwd");
3996
3997 static u_long numcwdcalls;
3998 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
3999     "Number of current directory resolution calls");
4000 static u_long numcwdfailnf;
4001 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
4002     "Number of current directory failures due to lack of file");
4003 static u_long numcwdfailsz;
4004 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
4005     "Number of current directory failures due to large result");
4006 static u_long numcwdfound;
4007 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
4008     "Number of current directory resolution successes");
4009
4010 /*
4011  * MPALMOSTSAFE
4012  */
4013 int
4014 sys___getcwd(struct __getcwd_args *uap)
4015 {
4016         u_int buflen;
4017         int error;
4018         char *buf;
4019         char *bp;
4020
4021         if (disablecwd)
4022                 return (ENODEV);
4023
4024         buflen = uap->buflen;
4025         if (buflen == 0)
4026                 return (EINVAL);
4027         if (buflen > MAXPATHLEN)
4028                 buflen = MAXPATHLEN;
4029
4030         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
4031         bp = kern_getcwd(buf, buflen, &error);
4032         if (error == 0)
4033                 error = copyout(bp, uap->buf, strlen(bp) + 1);
4034         kfree(buf, M_TEMP);
4035         return (error);
4036 }
4037
4038 char *
4039 kern_getcwd(char *buf, size_t buflen, int *error)
4040 {
4041         struct proc *p = curproc;
4042         char *bp;
4043         int i, slash_prefixed;
4044         struct filedesc *fdp;
4045         struct nchandle nch;
4046         struct namecache *ncp;
4047
4048         numcwdcalls++;
4049         bp = buf;
4050         bp += buflen - 1;
4051         *bp = '\0';
4052         fdp = p->p_fd;
4053         slash_prefixed = 0;
4054
4055         nch = fdp->fd_ncdir;
4056         ncp = nch.ncp;
4057         if (ncp)
4058                 _cache_hold(ncp);
4059
4060         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
4061                nch.mount != fdp->fd_nrdir.mount)
4062         ) {
4063                 /*
4064                  * While traversing upwards if we encounter the root
4065                  * of the current mount we have to skip to the mount point
4066                  * in the underlying filesystem.
4067                  */
4068                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
4069                         nch = nch.mount->mnt_ncmounton;
4070                         _cache_drop(ncp);
4071                         ncp = nch.ncp;
4072                         if (ncp)
4073                                 _cache_hold(ncp);
4074                         continue;
4075                 }
4076
4077                 /*
4078                  * Prepend the path segment
4079                  */
4080                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4081                         if (bp == buf) {
4082                                 numcwdfailsz++;
4083                                 *error = ERANGE;
4084                                 bp = NULL;
4085                                 goto done;
4086                         }
4087                         *--bp = ncp->nc_name[i];
4088                 }
4089                 if (bp == buf) {
4090                         numcwdfailsz++;
4091                         *error = ERANGE;
4092                         bp = NULL;
4093                         goto done;
4094                 }
4095                 *--bp = '/';
4096                 slash_prefixed = 1;
4097
4098                 /*
4099                  * Go up a directory.  This isn't a mount point so we don't
4100                  * have to check again.
4101                  */
4102                 while ((nch.ncp = ncp->nc_parent) != NULL) {
4103                         if (ncp_shared_lock_disable)
4104                                 _cache_lock(ncp);
4105                         else
4106                                 _cache_lock_shared(ncp);
4107                         if (nch.ncp != ncp->nc_parent) {
4108                                 _cache_unlock(ncp);
4109                                 continue;
4110                         }
4111                         _cache_hold(nch.ncp);
4112                         _cache_unlock(ncp);
4113                         break;
4114                 }
4115                 _cache_drop(ncp);
4116                 ncp = nch.ncp;
4117         }
4118         if (ncp == NULL) {
4119                 numcwdfailnf++;
4120                 *error = ENOENT;
4121                 bp = NULL;
4122                 goto done;
4123         }
4124         if (!slash_prefixed) {
4125                 if (bp == buf) {
4126                         numcwdfailsz++;
4127                         *error = ERANGE;
4128                         bp = NULL;
4129                         goto done;
4130                 }
4131                 *--bp = '/';
4132         }
4133         numcwdfound++;
4134         *error = 0;
4135 done:
4136         if (ncp)
4137                 _cache_drop(ncp);
4138         return (bp);
4139 }
4140
4141 /*
4142  * Thus begins the fullpath magic.
4143  *
4144  * The passed nchp is referenced but not locked.
4145  */
4146 static int disablefullpath;
4147 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
4148     &disablefullpath, 0,
4149     "Disable fullpath lookups");
4150
4151 static u_int numfullpathcalls;
4152 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD,
4153     &numfullpathcalls, 0,
4154     "Number of full path resolutions in progress");
4155 static u_int numfullpathfailnf;
4156 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD,
4157     &numfullpathfailnf, 0,
4158     "Number of full path resolution failures due to lack of file");
4159 static u_int numfullpathfailsz;
4160 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD,
4161     &numfullpathfailsz, 0,
4162     "Number of full path resolution failures due to insufficient memory");
4163 static u_int numfullpathfound;
4164 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD,
4165     &numfullpathfound, 0,
4166     "Number of full path resolution successes");
4167
4168 int
4169 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
4170                char **retbuf, char **freebuf, int guess)
4171 {
4172         struct nchandle fd_nrdir;
4173         struct nchandle nch;
4174         struct namecache *ncp;
4175         struct mount *mp, *new_mp;
4176         char *bp, *buf;
4177         int slash_prefixed;
4178         int error = 0;
4179         int i;
4180
4181         atomic_add_int(&numfullpathcalls, -1);
4182
4183         *retbuf = NULL;
4184         *freebuf = NULL;
4185
4186         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
4187         bp = buf + MAXPATHLEN - 1;
4188         *bp = '\0';
4189         if (nchbase)
4190                 fd_nrdir = *nchbase;
4191         else if (p != NULL)
4192                 fd_nrdir = p->p_fd->fd_nrdir;
4193         else
4194                 fd_nrdir = rootnch;
4195         slash_prefixed = 0;
4196         nch = *nchp;
4197         ncp = nch.ncp;
4198         if (ncp)
4199                 _cache_hold(ncp);
4200         mp = nch.mount;
4201
4202         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
4203                 new_mp = NULL;
4204
4205                 /*
4206                  * If we are asked to guess the upwards path, we do so whenever
4207                  * we encounter an ncp marked as a mountpoint. We try to find
4208                  * the actual mountpoint by finding the mountpoint with this
4209                  * ncp.
4210                  */
4211                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
4212                         new_mp = mount_get_by_nc(ncp);
4213                 }
4214                 /*
4215                  * While traversing upwards if we encounter the root
4216                  * of the current mount we have to skip to the mount point.
4217                  */
4218                 if (ncp == mp->mnt_ncmountpt.ncp) {
4219                         new_mp = mp;
4220                 }
4221                 if (new_mp) {
4222                         nch = new_mp->mnt_ncmounton;
4223                         _cache_drop(ncp);
4224                         ncp = nch.ncp;
4225                         if (ncp)
4226                                 _cache_hold(ncp);
4227                         mp = nch.mount;
4228                         continue;
4229                 }
4230
4231                 /*
4232                  * Prepend the path segment
4233                  */
4234                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4235                         if (bp == buf) {
4236                                 numfullpathfailsz++;
4237                                 kfree(buf, M_TEMP);
4238                                 error = ENOMEM;
4239                                 goto done;
4240                         }
4241                         *--bp = ncp->nc_name[i];
4242                 }
4243                 if (bp == buf) {
4244                         numfullpathfailsz++;
4245                         kfree(buf, M_TEMP);
4246                         error = ENOMEM;
4247                         goto done;
4248                 }
4249                 *--bp = '/';
4250                 slash_prefixed = 1;
4251
4252                 /*
4253                  * Go up a directory.  This isn't a mount point so we don't
4254                  * have to check again.
4255                  *
4256                  * We can only safely access nc_parent with ncp held locked.
4257                  */
4258                 while ((nch.ncp = ncp->nc_parent) != NULL) {
4259                         _cache_lock(ncp);
4260                         if (nch.ncp != ncp->nc_parent) {
4261                                 _cache_unlock(ncp);
4262                                 continue;
4263                         }
4264                         _cache_hold(nch.ncp);
4265                         _cache_unlock(ncp);
4266                         break;
4267                 }
4268                 _cache_drop(ncp);
4269                 ncp = nch.ncp;
4270         }
4271         if (ncp == NULL) {
4272                 numfullpathfailnf++;
4273                 kfree(buf, M_TEMP);
4274                 error = ENOENT;
4275                 goto done;
4276         }
4277
4278         if (!slash_prefixed) {
4279                 if (bp == buf) {
4280                         numfullpathfailsz++;
4281                         kfree(buf, M_TEMP);
4282                         error = ENOMEM;
4283                         goto done;
4284                 }
4285                 *--bp = '/';
4286         }
4287         numfullpathfound++;
4288         *retbuf = bp;
4289         *freebuf = buf;
4290         error = 0;
4291 done:
4292         if (ncp)
4293                 _cache_drop(ncp);
4294         return(error);
4295 }
4296
4297 int
4298 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
4299             char **freebuf, int guess)
4300 {
4301         struct namecache *ncp;
4302         struct nchandle nch;
4303         int error;
4304
4305         *freebuf = NULL;
4306         atomic_add_int(&numfullpathcalls, 1);
4307         if (disablefullpath)
4308                 return (ENODEV);
4309
4310         if (p == NULL)
4311                 return (EINVAL);
4312
4313         /* vn is NULL, client wants us to use p->p_textvp */
4314         if (vn == NULL) {
4315                 if ((vn = p->p_textvp) == NULL)
4316                         return (EINVAL);
4317         }
4318         spin_lock_shared(&vn->v_spin);
4319         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
4320                 if (ncp->nc_nlen)
4321                         break;
4322         }
4323         if (ncp == NULL) {
4324                 spin_unlock_shared(&vn->v_spin);
4325                 return (EINVAL);
4326         }
4327         _cache_hold(ncp);
4328         spin_unlock_shared(&vn->v_spin);
4329
4330         atomic_add_int(&numfullpathcalls, -1);
4331         nch.ncp = ncp;
4332         nch.mount = vn->v_mount;
4333         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
4334         _cache_drop(ncp);
4335         return (error);
4336 }