sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  */
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/kernel.h>
  68 #include <sys/sysctl.h>
  69 #include <sys/mount.h>
  70 #include <sys/vnode.h>
  71 #include <sys/malloc.h>
  72 #include <sys/sysproto.h>
  73 #include <sys/spinlock.h>
  74 #include <sys/proc.h>
  75 #include <sys/namei.h>
  76 #include <sys/nlookup.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/fnv_hash.h>
  79 #include <sys/globaldata.h>
  80 #include <sys/kern_syscall.h>
  81 #include <sys/dirent.h>
  82 #include <ddb/ddb.h>
  83
  84 #include <sys/spinlock2.h>
  85
  86 #define MAX_RECURSION_DEPTH     64      /* recursion cap; NOTE(review): its users are outside this view - confirm where it is enforced */
  87
  88 /*
  89  * Random lookups in the cache are accomplished with a hash table using
  90  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
  91  *
  92  * Negative entries may exist and correspond to resolved namecache
  93  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
  94  * will be set if the entry corresponds to a whited-out directory entry
  95  * (versus simply not finding the entry at all).   pcpu_ncache[n].neg_list
  96  * is locked via pcpu_ncache[n].neg_spin;
  97  *
  98  * MPSAFE RULES:
  99  *
 100  * (1) A ncp must be referenced before it can be locked.
 101  *
 102  * (2) A ncp must be locked in order to modify it.
 103  *
 104  * (3) ncp locks are always ordered child -> parent.  That may seem
 105  *     backwards but forward scans use the hash table and thus can hold
 106  *     the parent unlocked when traversing downward.
 107  *
 108  *     This allows insert/rename/delete/dot-dot and other operations
 109  *     to use ncp->nc_parent links.
 110  *
 111  *     This also prevents a locked up e.g. NFS node from creating a
 112  *     chain reaction all the way back to the root vnode / namecache.
 113  *
 114  * (4) parent linkages require both the parent and child to be locked.
 115  */
 116
 117 /*
 118  * Structures associated with name caching.
 119  */
 120 #define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])   /* chain head for a hash value; nchash is applied as a bit mask */
 121 #define MINNEG                  1024    /* NOTE(review): presumably a floor for negative entries; users not in this view */
 122 #define MINPOS                  1024    /* NOTE(review): presumably a floor for positive entries; users not in this view */
 123 #define NCMOUNT_NUMCACHE        16301   /* prime number; element count of ncmount_cache[] */
 124
     /* malloc type used for namecache allocations ("vfscache"). */
 125 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 126
     /* Declares the list-head type struct nchash_list (a list of namecache). */
 127 LIST_HEAD(nchash_list, namecache);
 128
 129 /*
 130  * Don't cachealign, but at least pad to 32 bytes so entries
 131  * don't cross a cache line.
      *
      * One of these exists per hash chain: the chain itself plus the spin
      * lock that protects it (each hash chain has its own spin lock).
      *
      * NOTE(review): a queue(3) LIST_HEAD conventionally contains a single
      * pointer, which would make 'list' 8 bytes on LP64 rather than the
      * 16 annotated below, and the structure 24 bytes rather than 32.
      * Confirm sizeof(struct nchash_head) before relying on the padding.
 132  */
 133 struct nchash_head {
 134        struct nchash_list list; /* 16 bytes */
 135        struct spinlock  spin;   /* 8 bytes */
 136        long     pad01;          /* 8 bytes */
 137 };
 138
 139 struct ncmount_cache {
     /*
      * One slot of the ncmount_cache[] table (NCMOUNT_NUMCACHE slots),
      * cache-line aligned via __cachealign.  The code that fills and
      * looks up slots is outside this view.
      * NOTE(review): presumably 'spin' protects the other members of the
      * slot - confirm against the users.
      */
 140         struct spinlock spin;
 141         struct namecache *ncp;
 142         struct mount *mp;
 143         int isneg;              /* if != 0 mp is originator and not target */
 144 } __cachealign;
 145
 146 struct pcpu_ncache {
     /*
      * Per-cpu namecache state, cache-line aligned via __cachealign.
      * neg_list is locked via neg_spin.
      * NOTE(review): the vfscache_* members mirror the names of the
      * global vfs.cache statistics; presumably they are per-cpu
      * counters that get rolled up - confirm against the users.
      */
 147         struct spinlock         neg_spin;       /* for neg_list and neg_count */
 148         struct namecache_list   neg_list;
 149         long                    neg_count;
 150         long                    vfscache_negs;
 151         long                    vfscache_count;
 152         long                    vfscache_leafs;
 153 } __cachealign;
 154
 155 static struct nchash_head       *nchashtbl;     /* hash chain array indexed by NCHHASH(); allocated elsewhere */
 156 static struct pcpu_ncache       *pcpu_ncache;   /* indexed by cpu (pcpu_ncache[n]); allocated elsewhere */
 157 static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];        /* statically sized mount-point cache table */
 158
 159 /*
 160  * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 161  * to create the namecache infrastructure leading to a dangling vnode.
 162  *
 163  * 0    Only errors are reported
 164  * 1    Successes are reported
 165  * 2    Successes + the whole directory scan is reported
 166  * 3    Force the directory scan code run as if the parent vnode did not
 167  *      have a namecache record, even if it does have one.
 168  */
 169 static int      ncvp_debug;             /* debug.ncvp_debug, levels documented above */
 170 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
 171     "Namecache debug level (0-3)");
 172
 173 static u_long   nchash;                 /* applied as the mask in NCHHASH(); NOTE(review): described as the table size - confirm it holds size-1 */
 174 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 175     "Size of namecache hash table");
 176
 177 static int      ncnegflush = 10;        /* burst for negative flush */
 178 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
 179     "Batch flush negative entries");
 180
 181 static int      ncposflush = 10;        /* burst for positive flush */
 182 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
 183     "Batch flush positive entries");
 184
 185 static int      ncnegfactor = 16;       /* ratio of negative entries */
 186 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 187     "Ratio of namecache negative entries");
 188
 189 static int      nclockwarn;             /* tsleep() timeout in ticks used by the ncp lock paths before they print a warning */
 190 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
 191     "Warn on locked namecache entries in ticks");
 192
 193 static int      numdefered;             /* NOTE(review): presumably the count of deferred entries; the description string below looks copy-pasted - confirm */
 194 SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
 195     "Number of cache entries allocated");
 196
 197 static int      ncposlimit;             /* NOTE(review): read-write tunable, presumably a limit on positive entries; the description string below looks copy-pasted - confirm */
 198 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
 199     "Number of cache entries allocated");
 200
 201 static int      ncp_shared_lock_disable = 0;
 202 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
 203            &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
 204
     /*
      * Read-only exports of structure sizes: no backing variable is
      * given (pointer argument 0), the sizeof value is passed as the
      * sysctl's constant argument instead.
      */
 205 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
 206     "sizeof(struct vnode)");
 207 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
 208     "sizeof(struct namecache)");
 209
 210 static int      ncmount_cache_enable = 1;       /* enabled by default */
 211 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
 212            &ncmount_cache_enable, 0, "mount point cache");
 213
 214 static __inline void _cache_drop(struct namecache *ncp);        /* forward declarations of file-local helpers start here */
 215 static int cache_resolve_mp(struct mount *mp);
 216 static struct vnode *cache_dvpref(struct namecache *ncp);
 217 static void _cache_lock(struct namecache *ncp);
 218 static void _cache_setunresolved(struct namecache *ncp);
 219 static void _cache_cleanneg(long count);
 220 static void _cache_cleanpos(long count);
 221 static void _cache_cleandefered(void);
 222 static void _cache_unlink(struct namecache *ncp);
     /* The rollup helper below is currently compiled out. */
 223 #if 0
 224 static void vfscache_rollup_all(void);
 225 #endif
 226
 227 /*
 228  * The new name cache statistics
      *
      * Creates the vfs.cache sysctl node and exports three read-only
      * counters beneath it (numneg, numcache, numleafs).
      *
      * NOTE(review): numleafs carries the same description string as
      * numcache ("Number of namecaches entries"); it presumably should
      * describe leaf entries - confirm and fix the text.
 229  */
 230 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
 231 static long vfscache_negs;
 232 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
 233     "Number of negative namecache entries");
 234 static long vfscache_count;
 235 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
 236     "Number of namecaches entries");
 237 static long vfscache_leafs;
 238 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
 239     "Number of namecaches entries");
 240
     /*
      * Per-cpu lookup statistics storage.
      * NOTE(review): sized by SMP_MAXCPU while pcpu_mntcache[] uses
      * MAXCPU - confirm the two constants agree.
      */
 241 struct nchstats nchstats[SMP_MAXCPU];
 242 /*
 243  * Export VFS cache effectiveness statistics to user-land.
 244  *
 245  * The statistics are left for aggregation to user-land so
 246  * neat things can be achieved, like observing per-CPU cache
 247  * distribution.
 248  */
 249 static int
 250 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 251 {
 252         struct globaldata *gd;
 253         int i, error;
 254
 255         error = 0;
 256         for (i = 0; i < ncpus; ++i) {
 257                 gd = globaldata_find(i);
 258                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
 259                         sizeof(struct nchstats))))
 260                         break;
 261         }
 262
 263         return (error);
 264 }
 265 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
 266   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 267
 268 static struct namecache *cache_zap(struct namecache *ncp, int nonblock);        /* forward declaration; the definition is not in this view */
 269
 270 /*
 271  * Cache mount points and namecache records in order to avoid unnecessary
 272  * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 273  * performance and is particularly important on multi-socket systems to
 274  * reduce cache-line ping-ponging.
 275  *
 276  * Try to keep the pcpu structure within one cache line (~64 bytes).
 277  */
 278 #define MNTCACHE_COUNT      5       /* number of slots in mntcache.mntary[] */
 279
     /*
      * Per-cpu cache of mount and namecache references; pcpu_mntcache[]
      * holds one of these per cpu.
      *
      * NOTE(review): on LP64 the visible members add up to more than 64
      * bytes (5 + 2 pointers, an nchandle containing at least the ncp and
      * mount pointers that cache_clearmntcache() touches, and two ints),
      * so the "within one cache line" goal should be re-checked.
      */
 280 struct mntcache {
 281         struct mount    *mntary[MNTCACHE_COUNT];       /* parked mounts; a non-NULL slot carries one mnt_refs count (see _cache_mntrel/_cache_mntref) */
 282         struct namecache *ncp1;         /* released with _cache_drop() by cache_clearmntcache(); filled outside this view */
 283         struct namecache *ncp2;         /* same handling as ncp1 */
 284         struct nchandle  ncdir;         /* .ncp is _cache_drop()'d and .mount loses a mnt_refs count in cache_clearmntcache() */
 285         int             iter;           /* incremented by _cache_mntrel() to pick the slot to evict when none is free */
 286         int             unused01;
 287 } __cachealign;
 288
 289 static struct mntcache  pcpu_mntcache[MAXCPU];  /* indexed by cpu id */
 290
 291 static
 292 void
 293 _cache_mntref(struct mount *mp)
 294 {
 295         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
 296         int i;
 297
 298         for (i = 0; i < MNTCACHE_COUNT; ++i) {
 299                 if (cache->mntary[i] != mp)
 300                         continue;
 301                 if (atomic_cmpset_ptr((void *)&cache->mntary[i], mp, NULL))
 302                         return;
 303         }
 304         atomic_add_int(&mp->mnt_refs, 1);
 305 }
 306
 307 static
 308 void
 309 _cache_mntrel(struct mount *mp)
 310 {
 311         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
 312         int i;
 313
 314         for (i = 0; i < MNTCACHE_COUNT; ++i) {
 315                 if (cache->mntary[i] == NULL) {
 316                         mp = atomic_swap_ptr((void *)&cache->mntary[i], mp);
 317                         if (mp == NULL)
 318                                 return;
 319                 }
 320         }
 321         i = (int)((uint32_t)++cache->iter % (uint32_t)MNTCACHE_COUNT);
 322         mp = atomic_swap_ptr((void *)&cache->mntary[i], mp);
 323         if (mp)
 324                 atomic_add_int(&mp->mnt_refs, -1);
 325 }
 326
 327 /*
 328  * Clears all cached mount points on all cpus.  This routine should only
 329  * be called when we are waiting for a mount to clear, e.g. so we can
 330  * unmount.
 331  */
 332 void
 333 cache_clearmntcache(void)
 334 {
 335         int n;
 336
 337         for (n = 0; n < ncpus; ++n) {
 338                 struct mntcache *cache = &pcpu_mntcache[n];
 339                 struct namecache *ncp;
 340                 struct mount *mp;
 341                 int i;
 342
 343                 for (i = 0; i < MNTCACHE_COUNT; ++i) {
 344                         if (cache->mntary[i]) {
 345                                 mp = atomic_swap_ptr(
 346                                         (void *)&cache->mntary[i], NULL);
 347                                 if (mp)
 348                                         atomic_add_int(&mp->mnt_refs, -1);
 349                         }
 350                 }
 351                 if (cache->ncp1) {
 352                         ncp = atomic_swap_ptr((void *)&cache->ncp1, NULL);
 353                         if (ncp)
 354                                 _cache_drop(ncp);
 355                 }
 356                 if (cache->ncp2) {
 357                         ncp = atomic_swap_ptr((void *)&cache->ncp2, NULL);
 358                         if (ncp)
 359                                 _cache_drop(ncp);
 360                 }
 361                 if (cache->ncdir.ncp) {
 362                         ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, NULL);
 363                         if (ncp)
 364                                 _cache_drop(ncp);
 365                 }
 366                 if (cache->ncdir.mount) {
 367                         mp = atomic_swap_ptr((void *)&cache->ncdir.mount, NULL);
 368                         if (mp)
 369                                 atomic_add_int(&mp->mnt_refs, -1);
 370                 }
 371         }
 372 }
 373
 374
 375 /*
 376  * Namespace locking.  The caller must already hold a reference to the
 377  * namecache structure in order to lock/unlock it.  This function prevents
 378  * the namespace from being created or destroyed by accessors other than
 379  * the lock holder.
 380  *
 381  * Note that holding a locked namecache structure prevents other threads
 382  * from making namespace changes (e.g. deleting or creating), prevents
 383  * vnode association state changes by other threads, and prevents the
 384  * namecache entry from being resolved or unresolved by other threads.
 385  *
 386  * An exclusive lock owner has full authority to associate/disassociate
 387  * vnodes and resolve/unresolve the locked ncp.
 388  *
 389  * A shared lock owner only has authority to acquire the underlying vnode,
 390  * if any.
 391  *
 392  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 393  * fact (when locking) or cleared prior to unlocking.
 394  *
 395  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 396  *           or recycled, but it does NOT help you if the vnode had already
 397  *           initiated a recyclement.  If this is important, use cache_get()
 398  *           rather than cache_lock() (and deal with the differences in the
 399  *           way the refs counter is handled).  Or, alternatively, make an
 400  *           unconditional call to cache_validate() or cache_resolve()
 401  *           after cache_lock() returns.
 402  */
 403 static
 404 void
 405 _cache_lock(struct namecache *ncp)
 406 {
     /*
      * Acquire the exclusive lock on ncp, sleeping until it is available.
      * Recursive acquisition by the thread that already owns the lock is
      * allowed.  On a fresh acquisition nc_locktd is set to the caller
      * and ncp->nc_vp, if any, is vhold()'d.
      */
 407         thread_t td;
 408         int didwarn;
 409         int begticks;
 410         int error;
 411         u_int count;
 412
     /* The caller must already hold a reference (MPSAFE rule 1). */
 413         KKASSERT(ncp->nc_refs != 0);
 414         didwarn = 0;
 415         begticks = 0;
 416         td = curthread;
 417
 418         for (;;) {
     /*
      * Snapshot the lock word once per iteration; every decision and
      * cmpset below is made against this snapshot.  cpu_ccfence() is
      * presumably a compiler-only barrier that prevents the load from
      * being repeated - confirm.
      */
 419                 count = ncp->nc_lockstatus;
 420                 cpu_ccfence();
 421
     /*
      * Case 1: no holder at all (only request bits may be set).
      * Take the lock by bumping the count from 0 to 1.
      */
 422                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
 423                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 424                                               count, count + 1)) {
 425                                 /*
 426                                  * The vp associated with a locked ncp must
 427                                  * be held to prevent it from being recycled.
 428                                  *
 429                                  * WARNING!  If VRECLAIMED is set the vnode
 430                                  * could already be in the middle of a recycle.
 431                                  * Callers must use cache_vref() or
 432                                  * cache_vget() on the locked ncp to
 433                                  * validate the vp or set the cache entry
 434                                  * to unresolved.
 435                                  *
 436                                  * NOTE! vhold() is allowed if we hold a
 437                                  *       lock on the ncp (which we do).
 438                                  */
 439                                 ncp->nc_locktd = td;
 440                                 if (ncp->nc_vp)
 441                                         vhold(ncp->nc_vp);
 442                                 break;
 443                         }
 444                         /* cmpset failed */
 445                         continue;
 446                 }
     /*
      * Case 2: this thread already owns the lock exclusively.
      * Recursive acquisition: just bump the count; nc_locktd is
      * already set and no additional vhold() is taken on this path.
      */
 447                 if (ncp->nc_locktd == td) {
 448                         KKASSERT((count & NC_SHLOCK_FLAG) == 0);
 449                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 450                                               count, count + 1)) {
 451                                 break;
 452                         }
 453                         /* cmpset failed */
 454                         continue;
 455                 }
     /*
      * Case 3: held by someone else.  Arm the sleep on &ncp->nc_locktd
      * first, then publish NC_EXLOCK_REQ; with PINTERLOCKED a wakeup
      * issued between the cmpset and the tsleep() is not lost.
      * NOTE(review): presumably the unlock path issues the wakeup when
      * it sees the request bit - confirm in _cache_unlock().
      */
 456                 tsleep_interlock(&ncp->nc_locktd, 0);
 457                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
 458                                       count | NC_EXLOCK_REQ) == 0) {
 459                         /* cmpset failed */
 460                         continue;
 461                 }
     /* Remember when we first blocked, for the elapsed-time report. */
 462                 if (begticks == 0)
 463                         begticks = ticks;
     /* Sleep with nclockwarn ticks as the timeout argument. */
 464                 error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
 465                                "clock", nclockwarn);
     /*
      * EWOULDBLOCK: emit the "blocked" diagnostic once.  didwarn is
      * assigned ticks but is only ever tested for non-zero.
      */
 466                 if (error == EWOULDBLOCK) {
 467                         if (didwarn == 0) {
 468                                 didwarn = ticks;
 469                                 kprintf("[diagnostic] cache_lock: "
 470                                         "%s blocked on %p %08x",
 471                                         td->td_comm, ncp, count);
 472                                 kprintf(" \"%*.*s\"\n",
 473                                         ncp->nc_nlen, ncp->nc_nlen,
 474                                         ncp->nc_name);
 475                         }
 476                 }
 477                 /* loop */
 478         }
     /*
      * If we warned, report how long the acquisition took, measured
      * from begticks and rounded to the nearest second.
      */
 479         if (didwarn) {
 480                 kprintf("[diagnostic] cache_lock: %s unblocked %*.*s after "
 481                         "%d secs\n",
 482                         td->td_comm,
 483                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 484                         (int)(ticks + (hz / 2) - begticks) / hz);
 485         }
 486 }
 487
 488 /*
 489  * The shared lock works similarly to the exclusive lock except
 490  * nc_locktd is left NULL and we need an interlock (VHOLD) to
 491  * prevent vhold() races, since the moment our cmpset_int succeeds
 492  * another cpu can come in and get its own shared lock.
 493  *
 494  * A critical section is needed to prevent interruption during the
 495  * VHOLD interlock.
 496  */
 497 static
 498 void
 499 _cache_lock_shared(struct namecache *ncp)
 500 {
     /*
      * Acquire a shared lock on ncp, sleeping until it is available.
      * nc_locktd is not touched.  The first shared holder vhold()s
      * ncp->nc_vp under the NC_SHLOCK_VHOLD interlock; later shared
      * holders wait for that interlock to clear before returning.
      */
 501         int didwarn;
 502         int error;
 503         u_int count;
     /*
      * Request bits that forbid piggy-backing on an existing shared
      * lock.  Starts as NC_EXLOCK_REQ and is zeroed after a timed-out
      * sleep, after which pending exclusive requests are ignored.
      */
 504         u_int optreq = NC_EXLOCK_REQ;
 505
 506         KKASSERT(ncp->nc_refs != 0);
 507         didwarn = 0;
 508
 509         for (;;) {
     /* Snapshot the lock word; all decisions below use the snapshot. */
 510                 count = ncp->nc_lockstatus;
 511                 cpu_ccfence();
 512
     /*
      * Case 1: nothing held and no exclusive request pending (only
      * NC_SHLOCK_REQ may be set).  Become the first shared holder,
      * setting NC_SHLOCK_FLAG and the NC_SHLOCK_VHOLD interlock in the
      * same cmpset.  The critical section covers the window in which
      * VHOLD is set.
      */
 513                 if ((count & ~NC_SHLOCK_REQ) == 0) {
 514                         crit_enter();
 515                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 516                                       count,
 517                                       (count + 1) | NC_SHLOCK_FLAG |
 518                                                     NC_SHLOCK_VHOLD)) {
 519                                 /*
 520                                  * The vp associated with a locked ncp must
 521                                  * be held to prevent it from being recycled.
 522                                  *
 523                                  * WARNING!  If VRECLAIMED is set the vnode
 524                                  * could already be in the middle of a recycle.
 525                                  * Callers must use cache_vref() or
 526                                  * cache_vget() on the locked ncp to
 527                                  * validate the vp or set the cache entry
 528                                  * to unresolved.
 529                                  *
 530                                  * NOTE! vhold() is allowed if we hold a
 531                                  *       lock on the ncp (which we do).
 532                                  */
 533                                 if (ncp->nc_vp)
 534                                         vhold(ncp->nc_vp);
     /* vhold done (or not needed): release the VHOLD interlock. */
 535                                 atomic_clear_int(&ncp->nc_lockstatus,
 536                                                  NC_SHLOCK_VHOLD);
 537                                 crit_exit();
 538                                 break;
 539                         }
 540                         /* cmpset failed */
 541                         crit_exit();
 542                         continue;
 543                 }
 544
 545                 /*
 546                  * If already held shared we can just bump the count, but
 547                  * only allow this if nobody is trying to get the lock
 548                  * exclusively.  If we are blocking too long ignore excl
 549                  * requests (which can race/deadlock us).
 550                  *
 551                  * VHOLD is a bit of a hack.  Even though we successfully
 552                  * added another shared ref, the cpu that got the first
 553                  * shared ref might not yet have held the vnode.
 554                  */
 555                 if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) {
 556                         KKASSERT((count & ~(NC_EXLOCK_REQ |
 557                                             NC_SHLOCK_REQ |
 558                                             NC_SHLOCK_FLAG)) > 0);
 559                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 560                                               count, count + 1)) {
     /* Spin until the first shared holder clears NC_SHLOCK_VHOLD. */
 561                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
 562                                         cpu_pause();
 563                                 break;
 564                         }
 565                         continue;
 566                 }
     /*
      * Case 3: must wait.  Shared waiters sleep on ncp itself (the
      * exclusive path sleeps on &ncp->nc_locktd) and publish
      * NC_SHLOCK_REQ after arming the sleep interlock.
      */
 567                 tsleep_interlock(ncp, 0);
 568                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
 569                                       count | NC_SHLOCK_REQ) == 0) {
 570                         /* cmpset failed */
 571                         continue;
 572                 }
 573                 error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
     /*
      * EWOULDBLOCK: stop deferring to exclusive requests from now on
      * and emit the "blocked" diagnostic once.  didwarn is back-dated
      * by nclockwarn so the elapsed time reported below includes the
      * sleep that just expired.
      */
 574                 if (error == EWOULDBLOCK) {
 575                         optreq = 0;
 576                         if (didwarn == 0) {
 577                                 didwarn = ticks - nclockwarn;
 578                                 kprintf("[diagnostic] cache_lock_shared: "
 579                                         "%s blocked on %p %08x "
 580                                         "\"%*.*s\"\n",
 581                                         curthread->td_comm, ncp, count,
 582                                         ncp->nc_nlen, ncp->nc_nlen,
 583                                         ncp->nc_name);
 584                         }
 585                 }
 586                 /* loop */
 587         }
     /* If we warned, report the elapsed time in whole seconds. */
 588         if (didwarn) {
 589                 kprintf("[diagnostic] cache_lock_shared: "
 590                         "%s unblocked %*.*s after %d secs\n",
 591                         curthread->td_comm,
 592                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 593                         (int)(ticks - didwarn) / hz);
 594         }
 595 }
 596
 597 /*
 598  * Lock ncp exclusively, return 0 on success.
 599  *
 600  * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
 601  *       such as the case where one of its children is locked.
 602  */
 603 static
 604 int
 605 _cache_lock_nonblock(struct namecache *ncp)
 606 {
 607         thread_t td;
 608         u_int count;
 609
 610         td = curthread;
 611
 612         for (;;) {
 613                 count = ncp->nc_lockstatus;
 614
 615                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
 616                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 617                                               count, count + 1)) {
 618                                 /*
 619                                  * The vp associated with a locked ncp must
 620                                  * be held to prevent it from being recycled.
 621                                  *
 622                                  * WARNING!  If VRECLAIMED is set the vnode
 623                                  * could already be in the middle of a recycle.
 624                                  * Callers must use cache_vref() or
 625                                  * cache_vget() on the locked ncp to
 626                                  * validate the vp or set the cache entry
 627                                  * to unresolved.
 628                                  *
 629                                  * NOTE! vhold() is allowed if we hold a
 630                                  *       lock on the ncp (which we do).
 631                                  */
 632                                 ncp->nc_locktd = td;
 633                                 if (ncp->nc_vp)
 634                                         vhold(ncp->nc_vp);
 635                                 break;
 636                         }
 637                         /* cmpset failed */
 638                         continue;
 639                 }
 640                 if (ncp->nc_locktd == td) {
 641                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 642                                               count, count + 1)) {
 643                                 break;
 644                         }
 645                         /* cmpset failed */
 646                         continue;
 647                 }
 648                 return(EWOULDBLOCK);
 649         }
 650         return(0);
 651 }
 652
 653 /*
 654  * The shared lock works similarly to the exclusive lock except
 655  * nc_locktd is left NULL and we need an interlock (VHOLD) to
 656  * prevent vhold() races, since the moment our cmpset_int succeeds
 657  * another cpu can come in and get its own shared lock.
 658  *
 659  * A critical section is needed to prevent interruption during the
 660  * VHOLD interlock.
 661  */
 662 static
 663 int
 664 _cache_lock_shared_nonblock(struct namecache *ncp)
 665 {
 666         u_int count;
 667
 668         for (;;) {
 669                 count = ncp->nc_lockstatus;
 670
 671                 if ((count & ~NC_SHLOCK_REQ) == 0) {
 672                         crit_enter();
 673                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 674                                       count,
 675                                       (count + 1) | NC_SHLOCK_FLAG |
 676                                                     NC_SHLOCK_VHOLD)) {
 677                                 /*
 678                                  * The vp associated with a locked ncp must
 679                                  * be held to prevent it from being recycled.
 680                                  *
 681                                  * WARNING!  If VRECLAIMED is set the vnode
 682                                  * could already be in the middle of a recycle.
 683                                  * Callers must use cache_vref() or
 684                                  * cache_vget() on the locked ncp to
 685                                  * validate the vp or set the cache entry
 686                                  * to unresolved.
 687                                  *
 688                                  * NOTE! vhold() is allowed if we hold a
 689                                  *       lock on the ncp (which we do).
 690                                  */
 691                                 if (ncp->nc_vp)
 692                                         vhold(ncp->nc_vp);
 693                                 atomic_clear_int(&ncp->nc_lockstatus,
 694                                                  NC_SHLOCK_VHOLD);
 695                                 crit_exit();
 696                                 break;
 697                         }
 698                         /* cmpset failed */
 699                         crit_exit();
 700                         continue;
 701                 }
 702
 703                 /*
 704                  * If already held shared we can just bump the count, but
 705                  * only allow this if nobody is trying to get the lock
 706                  * exclusively.
 707                  *
 708                  * VHOLD is a bit of a hack.  Even though we successfully
 709                  * added another shared ref, the cpu that got the first
 710                  * shared ref might not yet have held the vnode.
 711                  */
 712                 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
 713                     NC_SHLOCK_FLAG) {
 714                         KKASSERT((count & ~(NC_EXLOCK_REQ |
 715                                             NC_SHLOCK_REQ |
 716                                             NC_SHLOCK_FLAG)) > 0);
 717                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 718                                               count, count + 1)) {
 719                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
 720                                         cpu_pause();
 721                                 break;
 722                         }
 723                         continue;
 724                 }
 725                 return(EWOULDBLOCK);
 726         }
 727         return(0);
 728 }
 729
/*
 * Helper function: release one lock reference held on ncp.
 *
 * nc_lockstatus packs a lock count together with the NC_EXLOCK_REQ,
 * NC_SHLOCK_REQ and NC_SHLOCK_FLAG bits.  Releasing the last count
 * (1->0) clears the lock state, issues wakeups for any pending requests
 * and vdrop()s the vnode that was associated with the ncp (if any).
 * Any other release simply decrements the count.
 *
 * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
 *
 *       nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
 */
static
void
_cache_unlock(struct namecache *ncp)
{
        thread_t td __debugvar = curthread;
        u_int count;
        u_int ncount;
        struct vnode *dropvp;

        /*
         * Sanity checks: the lock count must be non-zero and the lock
         * must either be shared or owned by the calling thread.
         */
        KKASSERT(ncp->nc_refs >= 0);
        KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
        KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);

        count = ncp->nc_lockstatus;
        cpu_ccfence();

        /*
         * Clear nc_locktd prior to the atomic op (excl lock only)
         *
         * NC_SHLOCK_FLAG is deliberately left in the masked value, so
         * this only matches a non-shared lock whose count is exactly 1.
         */
        if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
                ncp->nc_locktd = NULL;
        dropvp = NULL;

        /*
         * Retry loop: recompute the new status from a fresh snapshot
         * each time the cmpset loses a race.
         */
        for (;;) {
                if ((count &
                     ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
                        /*
                         * Last lock count.  The new status drops the count
                         * and NC_SHLOCK_FLAG.  When an exclusive request is
                         * pending only a pending NC_SHLOCK_REQ is carried
                         * over, otherwise the status becomes 0.
                         */
                        dropvp = ncp->nc_vp;
                        if (count & NC_EXLOCK_REQ)
                                ncount = count & NC_SHLOCK_REQ; /* cnt->0 */
                        else
                                ncount = 0;

                        if (atomic_cmpset_int(&ncp->nc_lockstatus,
                                              count, ncount)) {
                                /*
                                 * Released.  Wakeup on &ncp->nc_locktd if
                                 * an exclusive request was pending, else on
                                 * ncp if a shared request was pending.
                                 */
                                if (count & NC_EXLOCK_REQ)
                                        wakeup(&ncp->nc_locktd);
                                else if (count & NC_SHLOCK_REQ)
                                        wakeup(ncp);
                                break;
                        }
                        /* cmpset failed, forget the vp and re-evaluate */
                        dropvp = NULL;
                } else {
                        /*
                         * Not the last count, just decrement it.  The vp
                         * stays held on behalf of the remaining lock
                         * holders.
                         */
                        KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
                        KKASSERT((count & ~(NC_EXLOCK_REQ |
                                            NC_SHLOCK_REQ |
                                            NC_SHLOCK_FLAG)) > 1);
                        if (atomic_cmpset_int(&ncp->nc_lockstatus,
                                              count, count - 1)) {
                                break;
                        }
                }
                count = ncp->nc_lockstatus;
                cpu_ccfence();
        }

        /*
         * Don't actually drop the vp until we successfully clean out
         * the lock, otherwise we may race another shared lock.
         */
        if (dropvp)
                vdrop(dropvp);
}
 799
 800 static
 801 int
 802 _cache_lockstatus(struct namecache *ncp)
 803 {
 804         if (ncp->nc_locktd == curthread)
 805                 return(LK_EXCLUSIVE);
 806         if (ncp->nc_lockstatus & NC_SHLOCK_FLAG)
 807                 return(LK_SHARED);
 808         return(-1);
 809 }
 810
 811 /*
 812  * cache_hold() and cache_drop() prevent the premature deletion of a
 813  * namecache entry but do not prevent operations (such as zapping) on
 814  * that namecache entry.
 815  *
 816  * This routine may only be called from outside this source module if
 817  * nc_refs is already at least 1.
 818  *
 819  * This is a rare case where callers are allowed to hold a spinlock,
 820  * so we can't ourselves.
 821  */
 822 static __inline
 823 struct namecache *
 824 _cache_hold(struct namecache *ncp)
 825 {
 826         atomic_add_int(&ncp->nc_refs, 1);
 827         return(ncp);
 828 }
 829
/*
 * Drop a cache entry, taking care to deal with races.
 *
 * For potential 1->0 transitions we must hold the ncp lock to safely
 * test its flags.  An unresolved entry with no children must be zapped
 * to avoid leaks.
 *
 * The call to cache_zap() itself will handle all remaining races and
 * will decrement the ncp's refs regardless.  If we are resolved or
 * have children nc_refs can safely be dropped to 0 without having to
 * zap the entry.
 *
 * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 *
 * NOTE: cache_zap() may return a non-NULL referenced parent which must
 *       be dropped in a loop.
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
        int refs;

        /*
         * Each iteration either completes the drop (break), moves on to
         * whatever cache_zap() returned (continue), or pauses and retries
         * after losing a race.  A NULL ncp terminates the loop.
         */
        while (ncp) {
                KKASSERT(ncp->nc_refs > 0);
                refs = ncp->nc_refs;

                if (refs == 1) {
                        /*
                         * Potential 1->0 transition.  Only proceed if the
                         * ncp lock can be obtained without blocking,
                         * otherwise fall through to the pause and retry.
                         */
                        if (_cache_lock_nonblock(ncp) == 0) {
                                ncp->nc_flag &= ~NCF_DEFEREDZAP;
                                if ((ncp->nc_flag & NCF_UNRESOLVED) &&
                                    TAILQ_EMPTY(&ncp->nc_list)) {
                                        /*
                                         * Unresolved with no children,
                                         * zap it and continue with the
                                         * returned entry (possibly NULL).
                                         */
                                        ncp = cache_zap(ncp, 1);
                                        continue;
                                }
                                if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
                                        _cache_unlock(ncp);
                                        break;
                                }
                                /* nc_refs changed under us, retry */
                                _cache_unlock(ncp);
                        }
                } else {
                        /*
                         * Not the last ref, a simple atomic decrement
                         * is sufficient.
                         */
                        if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
                                break;
                }
                cpu_pause();
        }
}
 878
 879 /*
 880  * Link a new namecache entry to its parent and to the hash table.  Be
 881  * careful to avoid races if vhold() blocks in the future.
 882  *
 883  * Both ncp and par must be referenced and locked.
 884  *
 885  * NOTE: The hash table spinlock is held during this call, we can't do
 886  *       anything fancy.
 887  */
 888 static void
 889 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 890                    struct nchash_head *nchpp)
 891 {
 892         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
 893
 894         KKASSERT(ncp->nc_parent == NULL);
 895         ncp->nc_parent = par;
 896         ncp->nc_head = nchpp;
 897
 898         /*
 899          * Set inheritance flags.  Note that the parent flags may be
 900          * stale due to getattr potentially not having been run yet
 901          * (it gets run during nlookup()'s).
 902          */
 903         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 904         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
 905                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 906         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
 907                 ncp->nc_flag |= NCF_UF_PCACHE;
 908
 909         /*
 910          * Add to hash table and parent, adjust accounting
 911          */
 912         LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 913         atomic_add_long(&pn->vfscache_count, 1);
 914         if (TAILQ_EMPTY(&ncp->nc_list))
 915                 atomic_add_long(&pn->vfscache_leafs, 1);
 916
 917         if (TAILQ_EMPTY(&par->nc_list)) {
 918                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 919                 atomic_add_long(&pn->vfscache_leafs, -1);
 920                 /*
 921                  * Any vp associated with an ncp which has children must
 922                  * be held to prevent it from being recycled.
 923                  */
 924                 if (par->nc_vp)
 925                         vhold(par->nc_vp);
 926         } else {
 927                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 928         }
 929 }
 930
 931 /*
 932  * Remove the parent and hash associations from a namecache structure.
 933  * If this is the last child of the parent the cache_drop(par) will
 934  * attempt to recursively zap the parent.
 935  *
 936  * ncp must be locked.  This routine will acquire a temporary lock on
 937  * the parent as wlel as the appropriate hash chain.
 938  */
 939 static void
 940 _cache_unlink_parent(struct namecache *ncp)
 941 {
 942         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
 943         struct namecache *par;
 944         struct vnode *dropvp;
 945
 946         if ((par = ncp->nc_parent) != NULL) {
 947                 KKASSERT(ncp->nc_parent == par);
 948                 _cache_hold(par);
 949                 _cache_lock(par);
 950                 spin_lock(&ncp->nc_head->spin);
 951
 952                 /*
 953                  * Remove from hash table and parent, adjust accounting
 954                  */
 955                 LIST_REMOVE(ncp, nc_hash);
 956                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
 957                 atomic_add_long(&pn->vfscache_count, -1);
 958                 if (TAILQ_EMPTY(&ncp->nc_list))
 959                         atomic_add_long(&pn->vfscache_leafs, -1);
 960
 961                 dropvp = NULL;
 962                 if (TAILQ_EMPTY(&par->nc_list)) {
 963                         atomic_add_long(&pn->vfscache_leafs, 1);
 964                         if (par->nc_vp)
 965                                 dropvp = par->nc_vp;
 966                 }
 967                 spin_unlock(&ncp->nc_head->spin);
 968                 ncp->nc_parent = NULL;
 969                 ncp->nc_head = NULL;
 970                 _cache_unlock(par);
 971                 _cache_drop(par);
 972
 973                 /*
 974                  * We can only safely vdrop with no spinlocks held.
 975                  */
 976                 if (dropvp)
 977                         vdrop(dropvp);
 978         }
 979 }
 980
 981 /*
 982  * Allocate a new namecache structure.  Most of the code does not require
 983  * zero-termination of the string but it makes vop_compat_ncreate() easier.
 984  */
 985 static struct namecache *
 986 cache_alloc(int nlen)
 987 {
 988         struct namecache *ncp;
 989
 990         ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
 991         if (nlen)
 992                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
 993         ncp->nc_nlen = nlen;
 994         ncp->nc_flag = NCF_UNRESOLVED;
 995         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
 996         ncp->nc_refs = 1;
 997
 998         TAILQ_INIT(&ncp->nc_list);
 999         _cache_lock(ncp);
1000         return(ncp);
1001 }
1002
1003 /*
1004  * Can only be called for the case where the ncp has never been
1005  * associated with anything (so no spinlocks are needed).
1006  */
1007 static void
1008 _cache_free(struct namecache *ncp)
1009 {
1010         KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1);
1011         if (ncp->nc_name)
1012                 kfree(ncp->nc_name, M_VFSCACHE);
1013         kfree(ncp, M_VFSCACHE);
1014 }
1015
1016 /*
1017  * [re]initialize a nchandle.
1018  */
1019 void
1020 cache_zero(struct nchandle *nch)
1021 {
1022         nch->ncp = NULL;
1023         nch->mount = NULL;
1024 }
1025
1026 /*
1027  * Ref and deref a namecache structure.
1028  *
1029  * The caller must specify a stable ncp pointer, typically meaning the
1030  * ncp is already referenced but this can also occur indirectly through
1031  * e.g. holding a lock on a direct child.
1032  *
1033  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
1034  *          use read spinlocks here.
1035  */
1036 struct nchandle *
1037 cache_hold(struct nchandle *nch)
1038 {
1039         _cache_hold(nch->ncp);
1040         _cache_mntref(nch->mount);
1041         return(nch);
1042 }
1043
1044 /*
1045  * Create a copy of a namecache handle for an already-referenced
1046  * entry.
1047  */
1048 void
1049 cache_copy(struct nchandle *nch, struct nchandle *target)
1050 {
1051         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1052         struct namecache *ncp;
1053
1054         *target = *nch;
1055         _cache_mntref(target->mount);
1056         ncp = target->ncp;
1057         if (ncp) {
1058                 if (ncp == cache->ncp1) {
1059                         if (atomic_cmpset_ptr((void *)&cache->ncp1, ncp, NULL))
1060                                 return;
1061                 }
1062                 if (ncp == cache->ncp2) {
1063                         if (atomic_cmpset_ptr((void *)&cache->ncp2, ncp, NULL))
1064                                 return;
1065                 }
1066                 _cache_hold(ncp);
1067         }
1068 }
1069
1070 /*
1071  * Caller wants to copy the current directory, copy it out from our
1072  * pcpu cache if possible (the entire critical path is just two localized
1073  * cmpset ops).  If the pcpu cache has a snapshot at all it will be a
1074  * valid one, so we don't have to lock p->p_fd even though we are loading
1075  * two fields.
1076  *
1077  * This has a limited effect since nlookup must still ref and shlock the
1078  * vnode to check perms.  We do avoid the per-proc spin-lock though, which
1079  * can aid threaded programs.
1080  */
1081 void
1082 cache_copy_ncdir(struct proc *p, struct nchandle *target)
1083 {
1084         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1085
1086         *target = p->p_fd->fd_ncdir;
1087         if (target->ncp == cache->ncdir.ncp &&
1088             target->mount == cache->ncdir.mount) {
1089                 if (atomic_cmpset_ptr((void *)&cache->ncdir.ncp,
1090                                       target->ncp, NULL)) {
1091                         if (atomic_cmpset_ptr((void *)&cache->ncdir.mount,
1092                                               target->mount, NULL)) {
1093                                 /* CRITICAL PATH */
1094                                 return;
1095                         }
1096                         _cache_drop(target->ncp);
1097                 }
1098         }
1099         spin_lock_shared(&p->p_fd->fd_spin);
1100         cache_copy(&p->p_fd->fd_ncdir, target);
1101         spin_unlock_shared(&p->p_fd->fd_spin);
1102 }
1103
1104 void
1105 cache_changemount(struct nchandle *nch, struct mount *mp)
1106 {
1107         _cache_mntref(mp);
1108         _cache_mntrel(nch->mount);
1109         nch->mount = mp;
1110 }
1111
1112 void
1113 cache_drop(struct nchandle *nch)
1114 {
1115         _cache_mntrel(nch->mount);
1116         _cache_drop(nch->ncp);
1117         nch->ncp = NULL;
1118         nch->mount = NULL;
1119 }
1120
1121 /*
1122  * Drop the nchandle, but try to cache the ref to avoid global atomic
1123  * ops.  This is typically done on the system root and jail root nchandles.
1124  */
1125 void
1126 cache_drop_and_cache(struct nchandle *nch)
1127 {
1128         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1129         struct namecache *ncp;
1130
1131         _cache_mntrel(nch->mount);
1132         ncp = nch->ncp;
1133         if (cache->ncp1 == NULL) {
1134                 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
1135                 if (ncp == NULL)
1136                         goto done;
1137         }
1138         if (cache->ncp2 == NULL) {
1139                 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
1140                 if (ncp == NULL)
1141                         goto done;
1142         }
1143         if (++cache->iter & 1)
1144                 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
1145         else
1146                 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
1147         if (ncp)
1148                 _cache_drop(ncp);
1149 done:
1150         nch->ncp = NULL;
1151         nch->mount = NULL;
1152 }
1153
1154 /*
1155  * We are dropping what the caller believes is the current directory,
1156  * unconditionally store it in our pcpu cache.  Anything already in
1157  * the cache will be discarded.
1158  */
1159 void
1160 cache_drop_ncdir(struct nchandle *nch)
1161 {
1162         struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1163
1164         nch->ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, nch->ncp);
1165         nch->mount = atomic_swap_ptr((void *)&cache->ncdir.mount, nch->mount);
1166         if (nch->ncp)
1167                 _cache_drop(nch->ncp);
1168         if (nch->mount)
1169                 _cache_mntrel(nch->mount);
1170         nch->ncp = NULL;
1171         nch->mount = NULL;
1172 }
1173
1174 int
1175 cache_lockstatus(struct nchandle *nch)
1176 {
1177         return(_cache_lockstatus(nch->ncp));
1178 }
1179
1180 void
1181 cache_lock(struct nchandle *nch)
1182 {
1183         _cache_lock(nch->ncp);
1184 }
1185
1186 void
1187 cache_lock_maybe_shared(struct nchandle *nch, int excl)
1188 {
1189         struct namecache *ncp = nch->ncp;
1190
1191         if (ncp_shared_lock_disable || excl ||
1192             (ncp->nc_flag & NCF_UNRESOLVED)) {
1193                 _cache_lock(ncp);
1194         } else {
1195                 _cache_lock_shared(ncp);
1196                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1197                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1198                                 _cache_unlock(ncp);
1199                                 _cache_lock(ncp);
1200                         }
1201                 } else {
1202                         _cache_unlock(ncp);
1203                         _cache_lock(ncp);
1204                 }
1205         }
1206 }
1207
/*
 * Relock nch1 given an unlocked nch1 and a locked nch2, so that both
 * are locked on return.  cache_resolve() is called on each handle after
 * it is locked here.  The caller is responsible for checking both
 * handles for validity on return as they may have become invalid.
 *
 * To deal with potential deadlocks the locks are ping-ponged: try the
 * missing lock without blocking, and on failure release the lock we
 * have, block on the one we want, then swap roles and try again.  We
 * always block somewhere when looping so this is not cpu-intensive.
 */
void
cache_relock(struct nchandle *nch1, struct ucred *cred1,
             struct nchandle *nch2, struct ucred *cred2)
{
        struct nchandle *want = nch1;           /* not locked yet */
        struct ucred *want_cred = cred1;
        struct nchandle *have = nch2;           /* currently locked */
        struct ucred *have_cred = cred2;
        struct nchandle *tmp_nch;
        struct ucred *tmp_cred;

        for (;;) {
                if (cache_lock_nonblock(want) == 0) {
                        /* got both */
                        cache_resolve(want, want_cred);
                        break;
                }

                /*
                 * Give up the lock we have and block on the other one.
                 */
                cache_unlock(have);
                cache_lock(want);
                cache_resolve(want, want_cred);

                /*
                 * Roles are now reversed.
                 */
                tmp_nch = want;
                want = have;
                have = tmp_nch;
                tmp_cred = want_cred;
                want_cred = have_cred;
                have_cred = tmp_cred;
        }
}
1250
1251 int
1252 cache_lock_nonblock(struct nchandle *nch)
1253 {
1254         return(_cache_lock_nonblock(nch->ncp));
1255 }
1256
1257 void
1258 cache_unlock(struct nchandle *nch)
1259 {
1260         _cache_unlock(nch->ncp);
1261 }
1262
1263 /*
1264  * ref-and-lock, unlock-and-deref functions.
1265  *
1266  * This function is primarily used by nlookup.  Even though cache_lock
1267  * holds the vnode, it is possible that the vnode may have already
1268  * initiated a recyclement.
1269  *
1270  * We want cache_get() to return a definitively usable vnode or a
1271  * definitively unresolved ncp.
1272  */
1273 static
1274 struct namecache *
1275 _cache_get(struct namecache *ncp)
1276 {
1277         _cache_hold(ncp);
1278         _cache_lock(ncp);
1279         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1280                 _cache_setunresolved(ncp);
1281         return(ncp);
1282 }
1283
1284 /*
1285  * Attempt to obtain a shared lock on the ncp.  A shared lock will only
1286  * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1287  * valid.  Otherwise an exclusive lock will be acquired instead.
1288  */
1289 static
1290 struct namecache *
1291 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1292 {
1293         if (ncp_shared_lock_disable || excl ||
1294             (ncp->nc_flag & NCF_UNRESOLVED)) {
1295                 return(_cache_get(ncp));
1296         }
1297         _cache_hold(ncp);
1298         _cache_lock_shared(ncp);
1299         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1300                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1301                         _cache_unlock(ncp);
1302                         ncp = _cache_get(ncp);
1303                         _cache_drop(ncp);
1304                 }
1305         } else {
1306                 _cache_unlock(ncp);
1307                 ncp = _cache_get(ncp);
1308                 _cache_drop(ncp);
1309         }
1310         return(ncp);
1311 }
1312
1313 /*
1314  * This is a special form of _cache_lock() which only succeeds if
1315  * it can get a pristine, non-recursive lock.  The caller must have
1316  * already ref'd the ncp.
1317  *
1318  * On success the ncp will be locked, on failure it will not.  The
1319  * ref count does not change either way.
1320  *
1321  * We want _cache_lock_special() (on success) to return a definitively
1322  * usable vnode or a definitively unresolved ncp.
1323  */
1324 static int
1325 _cache_lock_special(struct namecache *ncp)
1326 {
1327         if (_cache_lock_nonblock(ncp) == 0) {
1328                 if ((ncp->nc_lockstatus &
1329                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) {
1330                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1331                                 _cache_setunresolved(ncp);
1332                         return(0);
1333                 }
1334                 _cache_unlock(ncp);
1335         }
1336         return(EWOULDBLOCK);
1337 }
1338
1339 /*
1340  * This function tries to get a shared lock but will back-off to an exclusive
1341  * lock if:
1342  *
1343  * (1) Some other thread is trying to obtain an exclusive lock
1344  *     (to prevent the exclusive requester from getting livelocked out
1345  *     by many shared locks).
1346  *
1347  * (2) The current thread already owns an exclusive lock (to avoid
1348  *     deadlocking).
1349  *
1350  * WARNING! On machines with lots of cores we really want to try hard to
1351  *          get a shared lock or concurrent path lookups can chain-react
1352  *          into a very high-latency exclusive lock.
1353  */
1354 static int
1355 _cache_lock_shared_special(struct namecache *ncp)
1356 {
1357         /*
1358          * Only honor a successful shared lock (returning 0) if there is
1359          * no exclusive request pending and the vnode, if present, is not
1360          * in a reclaimed state.
1361          */
1362         if (_cache_lock_shared_nonblock(ncp) == 0) {
1363                 if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) {
1364                         if (ncp->nc_vp == NULL ||
1365                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
1366                                 return(0);
1367                         }
1368                 }
1369                 _cache_unlock(ncp);
1370                 return(EWOULDBLOCK);
1371         }
1372
1373         /*
1374          * Non-blocking shared lock failed.  If we already own the exclusive
1375          * lock just acquire another exclusive lock (instead of deadlocking).
1376          * Otherwise acquire a shared lock.
1377          */
1378         if (ncp->nc_locktd == curthread) {
1379                 _cache_lock(ncp);
1380                 return(0);
1381         }
1382         _cache_lock_shared(ncp);
1383         return(0);
1384 }
1385
1386
1387 /*
1388  * NOTE: The same nchandle can be passed for both arguments.
1389  */
1390 void
1391 cache_get(struct nchandle *nch, struct nchandle *target)
1392 {
1393         KKASSERT(nch->ncp->nc_refs > 0);
1394         target->mount = nch->mount;
1395         target->ncp = _cache_get(nch->ncp);
1396         _cache_mntref(target->mount);
1397 }
1398
1399 void
1400 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1401 {
1402         KKASSERT(nch->ncp->nc_refs > 0);
1403         target->mount = nch->mount;
1404         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1405         _cache_mntref(target->mount);
1406 }
1407
/*
 * Unlock-and-deref: release one lock count on the ncp, then drop one
 * reference.
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
        _cache_unlock(ncp);

        _cache_drop(ncp);
}
1418
1419 /*
1420  *
1421  */
1422 void
1423 cache_put(struct nchandle *nch)
1424 {
1425         _cache_mntrel(nch->mount);
1426         _cache_put(nch->ncp);
1427         nch->ncp = NULL;
1428         nch->mount = NULL;
1429 }
1430
1431 /*
1432  * Resolve an unresolved ncp by associating a vnode with it.  If the
1433  * vnode is NULL, a negative cache entry is created.
1434  *
1435  * The ncp should be locked on entry and will remain locked on return.
1436  */
1437 static
1438 void
1439 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
1440 {
1441         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
1442         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1443
1444         if (vp != NULL) {
1445                 /*
1446                  * Any vp associated with an ncp which has children must
1447                  * be held.  Any vp associated with a locked ncp must be held.
1448                  */
1449                 if (!TAILQ_EMPTY(&ncp->nc_list))
1450                         vhold(vp);
1451                 spin_lock(&vp->v_spin);
1452                 ncp->nc_vp = vp;
1453                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1454                 spin_unlock(&vp->v_spin);
1455                 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1456                         vhold(vp);
1457
1458                 /*
1459                  * Set auxiliary flags
1460                  */
1461                 switch(vp->v_type) {
1462                 case VDIR:
1463                         ncp->nc_flag |= NCF_ISDIR;
1464                         break;
1465                 case VLNK:
1466                         ncp->nc_flag |= NCF_ISSYMLINK;
1467                         /* XXX cache the contents of the symlink */
1468                         break;
1469                 default:
1470                         break;
1471                 }
1472                 ncp->nc_error = 0;
1473                 /* XXX: this is a hack to work-around the lack of a real pfs vfs
1474                  * implementation*/
1475                 if (mp != NULL)
1476                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1477                                 vp->v_pfsmp = mp;
1478         } else {
1479                 /*
1480                  * When creating a negative cache hit we set the
1481                  * namecache_gen.  A later resolve will clean out the
1482                  * negative cache hit if the mount point's namecache_gen
1483                  * has changed.  Used by devfs, could also be used by
1484                  * other remote FSs.
1485                  */
1486                 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
1487
1488                 ncp->nc_vp = NULL;
1489                 ncp->nc_negcpu = mycpu->gd_cpuid;
1490                 spin_lock(&pn->neg_spin);
1491                 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
1492                 ++pn->neg_count;
1493                 spin_unlock(&pn->neg_spin);
1494                 atomic_add_long(&pn->vfscache_negs, 1);
1495
1496                 ncp->nc_error = ENOENT;
1497                 if (mp)
1498                         VFS_NCPGEN_SET(mp, ncp);
1499         }
1500         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
1501 }
1502
1503 /*
1504  *
1505  */
1506 void
1507 cache_setvp(struct nchandle *nch, struct vnode *vp)
1508 {
1509         _cache_setvp(nch->mount, nch->ncp, vp);
1510 }
1511
1512 /*
1513  *
1514  */
1515 void
1516 cache_settimeout(struct nchandle *nch, int nticks)
1517 {
1518         struct namecache *ncp = nch->ncp;
1519
1520         if ((ncp->nc_timeout = ticks + nticks) == 0)
1521                 ncp->nc_timeout = 1;
1522 }
1523
1524 /*
1525  * Disassociate the vnode or negative-cache association and mark a
1526  * namecache entry as unresolved again.  Note that the ncp is still
1527  * left in the hash table and still linked to its parent.
1528  *
1529  * The ncp should be locked and refd on entry and will remain locked and refd
1530  * on return.
1531  *
1532  * This routine is normally never called on a directory containing children.
1533  * However, NFS often does just that in its rename() code as a cop-out to
1534  * avoid complex namespace operations.  This disconnects a directory vnode
1535  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1536  * sync.
1537  *
1538  */
1539 static
1540 void
1541 _cache_setunresolved(struct namecache *ncp)
1542 {
1543         struct vnode *vp;
1544
1545         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1546                 ncp->nc_flag |= NCF_UNRESOLVED;
1547                 ncp->nc_timeout = 0;
1548                 ncp->nc_error = ENOTCONN;
1549                 if ((vp = ncp->nc_vp) != NULL) {
1550                         spin_lock(&vp->v_spin);
1551                         ncp->nc_vp = NULL;
1552                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1553                         spin_unlock(&vp->v_spin);
1554
1555                         /*
1556                          * Any vp associated with an ncp with children is
1557                          * held by that ncp.  Any vp associated with a locked
1558                          * ncp is held by that ncp.  These conditions must be
1559                          * undone when the vp is cleared out from the ncp.
1560                          */
1561                         if (!TAILQ_EMPTY(&ncp->nc_list))
1562                                 vdrop(vp);
1563                         if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1564                                 vdrop(vp);
1565                 } else {
1566                         struct pcpu_ncache *pn;
1567
1568                         pn = &pcpu_ncache[ncp->nc_negcpu];
1569
1570                         atomic_add_long(&pn->vfscache_negs, -1);
1571                         spin_lock(&pn->neg_spin);
1572                         TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
1573                         --pn->neg_count;
1574                         spin_unlock(&pn->neg_spin);
1575                 }
1576                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1577         }
1578 }
1579
1580 /*
1581  * The cache_nresolve() code calls this function to automatically
1582  * set a resolved cache element to unresolved if it has timed out
1583  * or if it is a negative cache hit and the mount point namecache_gen
1584  * has changed.
1585  */
static __inline int
_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
{
        int expired;
        int stale_negative;

        /*
         * Timed-out entries are candidates, but only leaf entries (no
         * children): locked leafs may depend on the vnode remaining
         * intact in a parent.  The signed difference keeps the
         * comparison against ticks wrap-safe.
         */
        expired = ncp->nc_timeout != 0 &&
                  (int)(ncp->nc_timeout - ticks) < 0 &&
                  TAILQ_EMPTY(&ncp->nc_list);
        if (expired)
                return 1;

        /*
         * A resolved negative entry (no vnode) is invalid once the
         * mount's namecache generation has been bumped.
         */
        stale_negative = (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp));

        return (stale_negative ? 1 : 0);
}
1613
/*
 * Set a resolved ncp back to the unresolved state when
 * _cache_auto_unresolve_test() reports that it should be zapped.
 */
static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
        /* Already unresolved, nothing to do. */
        if (ncp->nc_flag & NCF_UNRESOLVED)
                return;

        /* Still good, leave it alone. */
        if (_cache_auto_unresolve_test(mp, ncp) == 0)
                return;

        _cache_setunresolved(ncp);
}
1625
/*
 * nchandle front-end for _cache_setunresolved(): sets nch->ncp to the
 * unresolved state (a no-op if it is already unresolved).
 */
void
cache_setunresolved(struct nchandle *nch)
{
        struct namecache *ncp = nch->ncp;

        _cache_setunresolved(ncp);
}
1634
1635 /*
1636  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1637  * looking for matches.  This flag tells the lookup code when it must
1638  * check for a mount linkage and also prevents the directories in question
1639  * from being deleted or renamed.
1640  */
static
int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
        struct nchandle *target = data;
        struct namecache *ncp = target->ncp;

        /*
         * Report a match (1) when the ncp is either the entry this mount
         * is mounted on or the mount's own mount point entry, else 0.
         */
        return(mp->mnt_ncmounton.ncp == ncp ||
               mp->mnt_ncmountpt.ncp == ncp);
}
1653
1654 /*
1655  * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
1656  * with a mount point.
1657  */
void
cache_clrmountpt(struct nchandle *nch)
{
        /*
         * Scan the mount list; a non-zero result means at least one mount
         * still references nch->ncp (presumably mountlist_scan() sums the
         * callback return values -- confirm against its definition), in
         * which case the flag must stay set.
         */
        if (mountlist_scan(cache_clrmountpt_callback, nch,
                           MNTSCAN_FORWARD|MNTSCAN_NOBUSY) != 0) {
                return;
        }
        nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}
1668
1669 /*
1670  * Invalidate portions of the namecache topology given a starting entry.
1671  * The passed ncp is set to an unresolved state and:
1672  *
1673  * The passed ncp must be referenced and locked.  The routine may unlock
1674  * and relock ncp several times, and will recheck the children and loop
1675  * to catch races.  When done the passed ncp will be returned with the
1676  * reference and lock intact.
1677  *
1678  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
1679  *                        that the physical underlying nodes have been
1680  *                        destroyed... as in deleted.  For example, when
1681  *                        a directory is removed.  This will cause record
1682  *                        lookups on the name to no longer be able to find
1683  *                        the record and tells the resolver to return failure
1684  *                        rather than trying to resolve through the parent.
1685  *
1686  *                        The topology itself, including ncp->nc_name,
1687  *                        remains intact.
1688  *
1689  *                        This only applies to the passed ncp.  Even if
1690  *                        CINV_CHILDREN is specified the children are not flagged.
1691  *
1692  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
1693  *                        state as well.
1694  *
1695  *                        Note that this will also have the side effect of
1696  *                        cleaning out any unreferenced nodes in the topology
1697  *                        from the leaves up as the recursion backs out.
1698  *
1699  * Note that the topology for any referenced nodes remains intact, but
1700  * the nodes will be marked as having been destroyed and will be set
1701  * to an unresolved state.
1702  *
1703  * It is possible for cache_inval() to race a cache_resolve(), meaning that
1704  * the namecache entry may not actually be invalidated on return if it was
1705  * revalidated while recursing down into its children.  This code guarantees
1706  * that the node(s) will go through an invalidation cycle, but does not
1707  * guarantee that they will remain in an invalidated state.
1708  *
1709  * Returns non-zero if a revalidation was detected during the invalidation
1710  * recursion, zero otherwise.  Note that since only the original ncp is
1711  * locked the revalidation ultimately can only indicate that the original ncp
1712  * *MIGHT* no have been reresolved.
1713  *
1714  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1715  * have to avoid blowing out the kernel stack.  We do this by saving the
1716  * deep namecache node and aborting the recursion, then re-recursing at that
1717  * node using a depth-first algorithm in order to allow multiple deep
1718  * recursions to chain through each other, then we restart the invalidation
1719  * from scratch.
1720  */
1721
/*
 * Recursion tracking state shared by _cache_inval() and every level of
 * _cache_inval_internal() during one invalidation.
 */
struct cinvtrack {
        struct namecache *resume_ncp;   /* held ncp recorded when the depth
                                         * limit is exceeded, else NULL */
        int depth;                      /* current child-scan nesting depth */
};
1726
1727 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1728
static
int
_cache_inval(struct namecache *ncp, int flags)
{
        struct cinvtrack track;
        struct namecache *deep;
        int res;

        track.resume_ncp = NULL;
        track.depth = 0;

        /*
         * Run the invalidation from the top.  Each time it aborts because
         * the recursion got too deep, finish the deep node(s) first with
         * the top-level ncp unlocked, then rerun from the top until a
         * pass completes without recording a resume point.
         */
        res = _cache_inval_internal(ncp, flags, &track);
        while (track.resume_ncp != NULL) {
                _cache_unlock(ncp);
                do {
                        /*
                         * resume_ncp was handed to us with a hold; lock
                         * it, invalidate below it, then put it.  Deep
                         * nodes are never flagged destroyed, and may
                         * themselves record a new resume point.
                         */
                        deep = track.resume_ncp;
                        track.resume_ncp = NULL;
                        _cache_lock(deep);
                        _cache_inval_internal(deep, flags & ~CINV_DESTROY,
                                              &track);
                        _cache_put(deep);
                } while (track.resume_ncp != NULL);
                _cache_lock(ncp);
                res = _cache_inval_internal(ncp, flags, &track);
        }
        return(res);
}
1756
/*
 * nchandle front-end for _cache_inval(): operates on nch->ncp and passes
 * the flags and the return value through unchanged.
 */
int
cache_inval(struct nchandle *nch, int flags)
{
        int rcnt;

        rcnt = _cache_inval(nch->ncp, flags);
        return(rcnt);
}
1762
1763 /*
1764  * Helper for _cache_inval().  The passed ncp is refd and locked and
1765  * remains that way on return, but may be unlocked/relocked multiple
1766  * times by the routine.
1767  */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
        struct namecache *nextkid;
        int rcnt = 0;           /* revalidations / deep-recursion aborts seen */

        /* The caller must hold ncp exclusively locked. */
        KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);

        _cache_setunresolved(ncp);
        if (flags & CINV_DESTROY) {
                /* Flag the entry destroyed and bump its generation. */
                ncp->nc_flag |= NCF_DESTROYED;
                ++ncp->nc_generation;
        }

        /*
         * Child scan, only with CINV_CHILDREN and only while ncp has
         * children.  The outer loop repeats only when a restart was
         * requested by the inner loop.
         */
        while ((flags & CINV_CHILDREN) &&
               (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
        ) {
                struct namecache *kid;
                int restart;

                restart = 0;
                _cache_hold(nextkid);

                /*
                 * Too deep: record ncp (with an extra hold) as the resume
                 * point.  The inner loop below then bails out on its first
                 * iteration and every caller up the chain does the same;
                 * _cache_inval() picks resume_ncp up afterwards.
                 */
                if (++track->depth > MAX_RECURSION_DEPTH) {
                        track->resume_ncp = ncp;
                        _cache_hold(ncp);
                        ++rcnt;
                }
                while ((kid = nextkid) != NULL) {
                        /*
                         * Parent (ncp) must be locked for the iteration.
                         */
                        nextkid = NULL;
                        if (kid->nc_parent != ncp) {
                                /*
                                 * kid is no longer a child of ncp, rescan
                                 * from the head of the child list.
                                 */
                                _cache_drop(kid);
                                kprintf("cache_inval_internal restartA %s\n",
                                        ncp->nc_name);
                                restart = 1;
                                break;
                        }

                        /*
                         * Hold the next sibling while the parent is still
                         * locked; it is visited after ncp is relocked.
                         */
                        if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
                                _cache_hold(nextkid);

                        /*
                         * Parent unlocked for this section to avoid
                         * deadlocks.
                         */
                        _cache_unlock(ncp);
                        if (track->resume_ncp) {
                                /* deep-recursion abort pending, back out */
                                _cache_drop(kid);
                                _cache_lock(ncp);
                                break;
                        }

                        /*
                         * Only descend into kids that are still resolved
                         * or that have children of their own.
                         */
                        if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
                            TAILQ_FIRST(&kid->nc_list)
                        ) {
                                _cache_lock(kid);
                                if (kid->nc_parent != ncp) {
                                        /*
                                         * kid changed parents while we
                                         * were acquiring its lock.
                                         */
                                        kprintf("cache_inval_internal "
                                                "restartB %s\n",
                                                ncp->nc_name);
                                        restart = 1;
                                        _cache_unlock(kid);
                                        _cache_drop(kid);
                                        _cache_lock(ncp);
                                        break;
                                }

                                /*
                                 * CINV_DESTROY is stripped so that only the
                                 * originally passed ncp is flagged destroyed.
                                 */
                                rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
                                _cache_unlock(kid);
                        }
                        _cache_drop(kid);
                        _cache_lock(ncp);
                }

                /* Release the sibling hold left over from an early break. */
                if (nextkid)
                        _cache_drop(nextkid);
                --track->depth;
                if (restart == 0)
                        break;
        }

        /*
         * Someone could have gotten in there and re-resolved ncp while it
         * was unlocked.  Count it so the caller can detect the
         * revalidation.
         */
        if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
                ++rcnt;
        return (rcnt);
}
1855
1856 /*
1857  * Invalidate a vnode's namecache associations.  To avoid races against
1858  * the resolver we do not invalidate a node which we previously invalidated
1859  * but which was then re-resolved while we were in the invalidation loop.
1860  *
1861  * Returns non-zero if any namecache entries remain after the invalidation
1862  * loop completed.
1863  *
1864  * NOTE: Unlike the namecache topology which guarantees that ncp's will not
1865  *       be ripped out of the topology while held, the vnode's v_namecache
1866  *       list has no such restriction.  NCP's can be ripped out of the list
1867  *       at virtually any time if not locked, even if held.
1868  *
1869  *       In addition, the v_namecache list itself must be locked via
1870  *       the vnode's spinlock.
1871  */
int
cache_inval_vp(struct vnode *vp, int flags)
{
        struct namecache *ncp;
        struct namecache *next;

        /*
         * Walk vp->v_namecache, invalidating each entry with
         * _cache_inval(ncp, flags).  The list is only stable while the
         * vnode spinlock is held, so the walk restarts from the head
         * whenever an entry turns out to no longer belong to vp.
         */
restart:
        spin_lock(&vp->v_spin);
        ncp = TAILQ_FIRST(&vp->v_namecache);
        if (ncp)
                _cache_hold(ncp);
        while (ncp) {
                /* loop entered with ncp held and vp spin-locked */
                if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
                        _cache_hold(next);

                /* drop the spinlock before taking the ncp lock */
                spin_unlock(&vp->v_spin);
                _cache_lock(ncp);
                if (ncp->nc_vp != vp) {
                        /*
                         * race-A: ncp was disassociated from vp before we
                         * obtained its lock.  Release everything we hold
                         * and start over.
                         */
                        kprintf("Warning: cache_inval_vp: race-A detected on "
                                "%s\n", ncp->nc_name);
                        _cache_put(ncp);
                        if (next)
                                _cache_drop(next);
                        goto restart;
                }
                _cache_inval(ncp, flags);
                _cache_put(ncp);                /* also releases reference */
                ncp = next;
                spin_lock(&vp->v_spin);
                if (ncp && ncp->nc_vp != vp) {
                        /*
                         * race-B: the held successor was disassociated
                         * from vp while the spinlock was released.
                         */
                        spin_unlock(&vp->v_spin);
                        kprintf("Warning: cache_inval_vp: race-B detected on "
                                "%s\n", ncp->nc_name);
                        _cache_drop(ncp);
                        goto restart;
                }
        }
        spin_unlock(&vp->v_spin);

        /* non-zero if any namecache entries remain associated with vp */
        return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
1912
1913 /*
1914  * This routine is used instead of the normal cache_inval_vp() when we
1915  * are trying to recycle otherwise good vnodes.
1916  *
1917  * Return 0 on success, non-zero if not all namecache records could be
1918  * disassociated from the vnode (for various reasons).
1919  */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
        struct namecache *ncp;
        struct namecache *next;

        /*
         * Same walk as cache_inval_vp() but best-effort: ncp locks are
         * only attempted non-blocking, and any lock failure or detected
         * race ends the walk instead of restarting it.  Entries are
         * invalidated with flags 0 (no CINV_DESTROY, no CINV_CHILDREN).
         */
        spin_lock(&vp->v_spin);
        ncp = TAILQ_FIRST(&vp->v_namecache);
        if (ncp)
                _cache_hold(ncp);
        while (ncp) {
                /* loop entered with ncp held */
                if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
                        _cache_hold(next);
                spin_unlock(&vp->v_spin);
                if (_cache_lock_nonblock(ncp)) {
                        /* could not get the lock, give up */
                        _cache_drop(ncp);
                        if (next)
                                _cache_drop(next);
                        goto done;
                }
                if (ncp->nc_vp != vp) {
                        /*
                         * race-A: ncp was disassociated from vp before we
                         * obtained its lock.
                         */
                        kprintf("Warning: cache_inval_vp_nonblock: race-A "
                                "detected on %s\n", ncp->nc_name);
                        _cache_put(ncp);
                        if (next)
                                _cache_drop(next);
                        goto done;
                }
                _cache_inval(ncp, 0);
                _cache_put(ncp);                /* also releases reference */
                ncp = next;
                spin_lock(&vp->v_spin);
                if (ncp && ncp->nc_vp != vp) {
                        /*
                         * race-B: the held successor was disassociated
                         * from vp while the spinlock was released.
                         */
                        spin_unlock(&vp->v_spin);
                        kprintf("Warning: cache_inval_vp_nonblock: race-B "
                                "detected on %s\n", ncp->nc_name);
                        _cache_drop(ncp);
                        goto done;
                }
        }
        spin_unlock(&vp->v_spin);
done:
        /* 0 only if no namecache records remain associated with vp */
        return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
1965
1966 /*
1967  * Clears the universal directory search 'ok' flag.  This flag allows
1968  * nlookup() to bypass normal vnode checks.  This flag is a cached flag
1969  * so clearing it simply forces revalidation.
1970  */
void
cache_inval_wxok(struct vnode *vp)
{
        struct namecache *scan;

        /* The v_namecache list may only be walked under the vnode spinlock. */
        spin_lock(&vp->v_spin);
        for (scan = TAILQ_FIRST(&vp->v_namecache);
             scan != NULL;
             scan = TAILQ_NEXT(scan, nc_vnode)) {
                /* skip the atomic op when the flag is not set */
                if ((scan->nc_flag & NCF_WXOK) == 0)
                        continue;
                atomic_clear_short(&scan->nc_flag, NCF_WXOK);
        }
        spin_unlock(&vp->v_spin);
}
1983
1984 /*
1985  * The source ncp has been renamed to the target ncp.  Both fncp and tncp
1986  * must be locked.  The target ncp is destroyed (as a normal rename-over
1987  * would destroy the target file or directory).
1988  *
1989  * Because there may be references to the source ncp we cannot copy its
1990  * contents to the target.  Instead the source ncp is relinked as the target
1991  * and the target ncp is removed from the namecache topology.
1992  */
void
cache_rename(struct nchandle *fnch, struct nchandle *tnch)
{
        struct namecache *fncp = fnch->ncp;
        struct namecache *tncp = tnch->ncp;
        struct namecache *newpar;
        struct nchash_head *bucket;
        u_int32_t hv;
        char *newname = NULL;
        char *oldname;

        /* Both entries are being changed, bump both generations. */
        ++fncp->nc_generation;
        ++tncp->nc_generation;

        /*
         * Make a private NUL-terminated copy of the target's name for
         * fncp to take over (left NULL when the target name is empty).
         */
        if (tncp->nc_nlen != 0) {
                newname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
                bcopy(tncp->nc_name, newname, tncp->nc_nlen);
                newname[tncp->nc_nlen] = 0;
        }

        /*
         * Detach fncp from its current parent, then swap in the new
         * name and free the old one.
         */
        _cache_unlink_parent(fncp);
        oldname = fncp->nc_name;
        fncp->nc_name = newname;
        fncp->nc_nlen = tncp->nc_nlen;
        if (oldname != NULL)
                kfree(oldname, M_VFSCACHE);

        /* The target's parent becomes fncp's parent; hold and lock it. */
        newpar = tncp->nc_parent;
        _cache_hold(newpar);
        _cache_lock(newpar);

        /*
         * The hash covers the new name followed by the parent pointer
         * value; it selects the chain fncp is relinked on.
         */
        hv = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
        hv = fnv_32_buf(&newpar, sizeof(newpar), hv);
        bucket = NCHHASH(hv);

        spin_lock(&bucket->spin);
        _cache_link_parent(fncp, newpar, bucket);
        spin_unlock(&bucket->spin);

        _cache_put(newpar);

        /* Finally get rid of the overwritten target entry. */
        _cache_unlink(tncp);
}
2046
2047 /*
2048  * Perform actions consistent with unlinking a file.  The passed-in ncp
2049  * must be locked.
2050  *
2051  * The ncp is marked DESTROYED so it no longer shows up in searches,
2052  * and will be physically deleted when the vnode goes away.
2053  *
2054  * If the related vnode has no refs then we cycle it through vget()/vput()
2055  * to (possibly if we don't have a ref race) trigger a deactivation,
2056  * allowing the VFS to trivially detect and recycle the deleted vnode
2057  * via VOP_INACTIVE().
2058  *
2059  * NOTE: cache_rename() will automatically call _cache_unlink() on the
2060  *       target ncp.
2061  */
void
cache_unlink(struct nchandle *nch)
{
        struct namecache *ncp = nch->ncp;

        /* all of the work is done on the underlying namecache entry */
        _cache_unlink(ncp);
}
2067
/*
 * Worker for cache_unlink() and cache_rename(): flag ncp destroyed and
 * try to push its vnode (if any) towards deactivation.
 */
static void
_cache_unlink(struct namecache *ncp)
{
        struct vnode *vp;

        /*
         * Lookups on this entry now fail, which also allows another ncp
         * with the same name to be created under ncp->nc_parent.
         */
        ncp->nc_flag |= NCF_DESTROYED;
        ++ncp->nc_generation;

        /* Nothing more to do without a resolved vnode. */
        if (ncp->nc_flag & NCF_UNRESOLVED)
                return;
        vp = ncp->nc_vp;
        if (vp == NULL)
                return;

        /*
         * Set VREF_FINALIZE to force action on the 1->0 ref transition.
         * If the vnode currently has no refs, cycle it through
         * vget()/vput() to try to trigger the deactivation right away.
         */
        atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
        if (VREFCNT(vp) > 0)
                return;
        if (vget(vp, LK_SHARED) == 0)
                vput(vp);
}
2093
2094 /*
2095  * Return non-zero if the nch might be associated with an open and/or mmap()'d
2096  * file.  The easy solution is to just return non-zero if the vnode has refs.
2097  * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
2098  * force the reclaim).
2099  */
int
cache_isopen(struct nchandle *nch)
{
        struct namecache *ncp = nch->ncp;
        struct vnode *vp;

        /* An unresolved entry has no vnode to inspect. */
        if (ncp->nc_flag & NCF_UNRESOLVED)
                return 0;

        /* Neither does a resolved entry without a vnode. */
        vp = ncp->nc_vp;
        if (vp == NULL)
                return 0;

        /* Any vnode ref counts as "might be open". */
        return (VREFCNT(vp) ? 1 : 0);
}
2113
2114
2115 /*
2116  * vget the vnode associated with the namecache entry.  Resolve the namecache
2117  * entry if necessary.  The passed ncp must be referenced and locked.  If
2118  * the ncp is resolved it might be locked shared.
2119  *
2120  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
2121  * (depending on the passed lk_type) will be returned in *vpp with an error
2122  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
2123  * most typical error is ENOENT, meaning that the ncp represents a negative
2124  * cache hit and there is no vnode to retrieve, but other errors can occur
2125  * too.
2126  *
2127  * The vget() can race a reclaim.  If this occurs we re-resolve the
2128  * namecache entry.
2129  *
2130  * There are numerous places in the kernel where vget() is called on a
2131  * vnode while one or more of its namecache entries is locked.  Releasing
2132  * a vnode never deadlocks against locked namecache entries (the vnode
2133  * will not get recycled while referenced ncp's exist).  This means we
2134  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
2135  * lock when acquiring the vp lock or we might cause a deadlock.
2136  *
2137  * NOTE: The passed-in ncp must be locked exclusively if it is initially
2138  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
2139  *       relocked exclusively before being re-resolved.
2140  */
2141 int
2142 cache_vget(struct nchandle *nch, struct ucred *cred,
2143            int lk_type, struct vnode **vpp)
2144 {
2145         struct namecache *ncp;
2146         struct vnode *vp;
2147         int error;
2148
2149         ncp = nch->ncp;
2150 again:
2151         vp = NULL;
2152         if (ncp->nc_flag & NCF_UNRESOLVED)
2153                 error = cache_resolve(nch, cred);
2154         else
2155                 error = 0;
2156
2157         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
2158                 error = vget(vp, lk_type);
2159                 if (error) {
2160                         /*
2161                          * VRECLAIM race
2162                          *
2163                          * The ncp may have been locked shared, we must relock
2164                          * it exclusively before we can set it to unresolved.
2165                          */
2166                         if (error == ENOENT) {
2167                                 kprintf("Warning: vnode reclaim race detected "
2168                                         "in cache_vget on %p (%s)\n",
2169                                         vp, ncp->nc_name);
2170                                 _cache_unlock(ncp);
2171                                 _cache_lock(ncp);
2172                                 _cache_setunresolved(ncp);
2173                                 goto again;
2174                         }
2175
2176                         /*
2177                          * Not a reclaim race, some other error.
2178                          */
2179                         KKASSERT(ncp->nc_vp == vp);
2180                         vp = NULL;
2181                 } else {
2182                         KKASSERT(ncp->nc_vp == vp);
2183                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2184                 }
2185         }
2186         if (error == 0 && vp == NULL)
2187                 error = ENOENT;
2188         *vpp = vp;
2189         return(error);
2190 }
2191
2192 /*
2193  * Similar to cache_vget() but only acquires a ref on the vnode.
2194  *
2195  * NOTE: The passed-in ncp must be locked exclusively if it is initially
2196  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
2197  *       relocked exclusively before being re-resolved.
2198  */
2199 int
2200 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
2201 {
2202         struct namecache *ncp;
2203         struct vnode *vp;
2204         int error;
2205
2206         ncp = nch->ncp;
2207 again:
2208         vp = NULL;
2209         if (ncp->nc_flag & NCF_UNRESOLVED)
2210                 error = cache_resolve(nch, cred);
2211         else
2212                 error = 0;
2213
2214         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
2215                 error = vget(vp, LK_SHARED);
2216                 if (error) {
2217                         /*
2218                          * VRECLAIM race
2219                          */
2220                         if (error == ENOENT) {
2221                                 kprintf("Warning: vnode reclaim race detected "
2222                                         "in cache_vget on %p (%s)\n",
2223                                         vp, ncp->nc_name);
2224                                 _cache_unlock(ncp);
2225                                 _cache_lock(ncp);
2226                                 _cache_setunresolved(ncp);
2227                                 goto again;
2228                         }
2229
2230                         /*
2231                          * Not a reclaim race, some other error.
2232                          */
2233                         KKASSERT(ncp->nc_vp == vp);
2234                         vp = NULL;
2235                 } else {
2236                         KKASSERT(ncp->nc_vp == vp);
2237                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2238                         /* caller does not want a lock */
2239                         vn_unlock(vp);
2240                 }
2241         }
2242         if (error == 0 && vp == NULL)
2243                 error = ENOENT;
2244         *vpp = vp;
2245         return(error);
2246 }
2247
2248 /*
2249  * Return a referenced vnode representing the parent directory of
2250  * ncp.
2251  *
2252  * Because the caller has locked the ncp it should not be possible for
2253  * the parent ncp to go away.  However, the parent can unresolve its
2254  * dvp at any time so we must be able to acquire a lock on the parent
2255  * to safely access nc_vp.
2256  *
2257  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
2258  * so use vhold()/vdrop() while holding the lock to prevent dvp from
2259  * getting destroyed.
2260  *
2261  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
2262  *       lock on the ncp in question..
2263  */
static struct vnode *
cache_dvpref(struct namecache *ncp)
{
        struct namecache *parent;
        struct vnode *dvp = NULL;

        parent = ncp->nc_parent;
        if (parent == NULL)
                return(NULL);

        /*
         * Sample the parent's vnode under the parent's lock and pin it
         * with vhold() so it cannot be destroyed once the lock is
         * released.
         */
        _cache_hold(parent);
        _cache_lock(parent);
        if ((parent->nc_flag & NCF_UNRESOLVED) == 0) {
                dvp = parent->nc_vp;
                if (dvp != NULL)
                        vhold(dvp);
        }
        _cache_unlock(parent);

        /*
         * Convert the hold into a real reference with the parent
         * unlocked.  On vget() failure no vnode is returned.
         */
        if (dvp != NULL) {
                if (vget(dvp, LK_SHARED) != 0) {
                        vdrop(dvp);
                        dvp = NULL;
                } else {
                        /* return refd, unlocked dvp */
                        vn_unlock(dvp);
                        vdrop(dvp);
                }
        }
        _cache_drop(parent);
        return(dvp);
}
2293
2294 /*
2295  * Convert a directory vnode to a namecache record without any other
2296  * knowledge of the topology.  This ONLY works with directory vnodes and
2297  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
2298  * returned ncp (if not NULL) will be held and unlocked.
2299  *
2300  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2301  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2302  * for dvp.  This will fail only if the directory has been deleted out from
2303  * under the caller.
2304  *
2305  * Callers must always check for a NULL return no matter the value of 'makeit'.
2306  *
2307  * To avoid underflowing the kernel stack each recursive call increments
2308  * the makeit variable.
2309  */
2310
2311 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2312                                   struct vnode *dvp, char *fakename);
2313 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2314                                   struct vnode **saved_dvp);
2315
2316 int
2317 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
2318               struct nchandle *nch)
2319 {
2320         struct vnode *saved_dvp;
2321         struct vnode *pvp;
2322         char *fakename;
2323         int error;
2324
2325         nch->ncp = NULL;
2326         nch->mount = dvp->v_mount;
2327         saved_dvp = NULL;
2328         fakename = NULL;
2329
2330         /*
2331          * Handle the makeit == 0 degenerate case
2332          */
2333         if (makeit == 0) {
2334                 spin_lock_shared(&dvp->v_spin);
2335                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2336                 if (nch->ncp)
2337                         cache_hold(nch);
2338                 spin_unlock_shared(&dvp->v_spin);
2339         }
2340
2341         /*
2342          * Loop until resolution, inside code will break out on error.
2343          */
2344         while (makeit) {
2345                 /*
2346                  * Break out if we successfully acquire a working ncp.
2347                  */
2348                 spin_lock_shared(&dvp->v_spin);
2349                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2350                 if (nch->ncp) {
2351                         cache_hold(nch);
2352                         spin_unlock_shared(&dvp->v_spin);
2353                         break;
2354                 }
2355                 spin_unlock_shared(&dvp->v_spin);
2356
2357                 /*
2358                  * If dvp is the root of its filesystem it should already
2359                  * have a namecache pointer associated with it as a side
2360                  * effect of the mount, but it may have been disassociated.
2361                  */
2362                 if (dvp->v_flag & VROOT) {
2363                         nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
2364                         error = cache_resolve_mp(nch->mount);
2365                         _cache_put(nch->ncp);
2366                         if (ncvp_debug) {
2367                                 kprintf("cache_fromdvp: resolve root of mount %p error %d",
2368                                         dvp->v_mount, error);
2369                         }
2370                         if (error) {
2371                                 if (ncvp_debug)
2372                                         kprintf(" failed\n");
2373                                 nch->ncp = NULL;
2374                                 break;
2375                         }
2376                         if (ncvp_debug)
2377                                 kprintf(" succeeded\n");
2378                         continue;
2379                 }
2380
2381                 /*
2382                  * If we are recursed too deeply resort to an O(n^2)
2383                  * algorithm to resolve the namecache topology.  The
2384                  * resolved pvp is left referenced in saved_dvp to
2385                  * prevent the tree from being destroyed while we loop.
2386                  */
2387                 if (makeit > 20) {
2388                         error = cache_fromdvp_try(dvp, cred, &saved_dvp);
2389                         if (error) {
2390                                 kprintf("lookupdotdot(longpath) failed %d "
2391                                        "dvp %p\n", error, dvp);
2392                                 nch->ncp = NULL;
2393                                 break;
2394                         }
2395                         continue;
2396                 }
2397
2398                 /*
2399                  * Get the parent directory and resolve its ncp.
2400                  */
2401                 if (fakename) {
2402                         kfree(fakename, M_TEMP);
2403                         fakename = NULL;
2404                 }
2405                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2406                                           &fakename);
2407                 if (error) {
2408                         kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
2409                         break;
2410                 }
2411                 vn_unlock(pvp);
2412
2413                 /*
2414                  * Reuse makeit as a recursion depth counter.  On success
2415                  * nch will be fully referenced.
2416                  */
2417                 cache_fromdvp(pvp, cred, makeit + 1, nch);
2418                 vrele(pvp);
2419                 if (nch->ncp == NULL)
2420                         break;
2421
2422                 /*
2423                  * Do an inefficient scan of pvp (embodied by ncp) to look
2424                  * for dvp.  This will create a namecache record for dvp on
2425                  * success.  We loop up to recheck on success.
2426                  *
2427                  * ncp and dvp are both held but not locked.
2428                  */
2429                 error = cache_inefficient_scan(nch, cred, dvp, fakename);
2430                 if (error) {
2431                         kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
2432                                 pvp, nch->ncp->nc_name, dvp);
2433                         cache_drop(nch);
2434                         /* nch was NULLed out, reload mount */
2435                         nch->mount = dvp->v_mount;
2436                         break;
2437                 }
2438                 if (ncvp_debug) {
2439                         kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
2440                                 pvp, nch->ncp->nc_name);
2441                 }
2442                 cache_drop(nch);
2443                 /* nch was NULLed out, reload mount */
2444                 nch->mount = dvp->v_mount;
2445         }
2446
2447         /*
2448          * If nch->ncp is non-NULL it will have been held already.
2449          */
2450         if (fakename)
2451                 kfree(fakename, M_TEMP);
2452         if (saved_dvp)
2453                 vrele(saved_dvp);
2454         if (nch->ncp)
2455                 return (0);
2456         return (EINVAL);
2457 }
2458
2459 /*
2460  * Go up the chain of parent directories until we find something
2461  * we can resolve into the namecache.  This is very inefficient.
2462  */
2463 static
2464 int
2465 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2466                   struct vnode **saved_dvp)
2467 {
2468         struct nchandle nch;
2469         struct vnode *pvp;
2470         int error;
2471         static time_t last_fromdvp_report;
2472         char *fakename;
2473
2474         /*
2475          * Loop getting the parent directory vnode until we get something we
2476          * can resolve in the namecache.
2477          */
2478         vref(dvp);
2479         nch.mount = dvp->v_mount;
2480         nch.ncp = NULL;
2481         fakename = NULL;
2482
2483         for (;;) {
2484                 if (fakename) {
2485                         kfree(fakename, M_TEMP);
2486                         fakename = NULL;
2487                 }
2488                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2489                                           &fakename);
2490                 if (error) {
2491                         vrele(dvp);
2492                         break;
2493                 }
2494                 vn_unlock(pvp);
2495                 spin_lock_shared(&pvp->v_spin);
2496                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2497                         _cache_hold(nch.ncp);
2498                         spin_unlock_shared(&pvp->v_spin);
2499                         vrele(pvp);
2500                         break;
2501                 }
2502                 spin_unlock_shared(&pvp->v_spin);
2503                 if (pvp->v_flag & VROOT) {
2504                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2505                         error = cache_resolve_mp(nch.mount);
2506                         _cache_unlock(nch.ncp);
2507                         vrele(pvp);
2508                         if (error) {
2509                                 _cache_drop(nch.ncp);
2510                                 nch.ncp = NULL;
2511                                 vrele(dvp);
2512                         }
2513                         break;
2514                 }
2515                 vrele(dvp);
2516                 dvp = pvp;
2517         }
2518         if (error == 0) {
2519                 if (last_fromdvp_report != time_uptime) {
2520                         last_fromdvp_report = time_uptime;
2521                         kprintf("Warning: extremely inefficient path "
2522                                 "resolution on %s\n",
2523                                 nch.ncp->nc_name);
2524                 }
2525                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2526
2527                 /*
2528                  * Hopefully dvp now has a namecache record associated with
2529                  * it.  Leave it referenced to prevent the kernel from
2530                  * recycling the vnode.  Otherwise extremely long directory
2531                  * paths could result in endless recycling.
2532                  */
2533                 if (*saved_dvp)
2534                     vrele(*saved_dvp);
2535                 *saved_dvp = dvp;
2536                 _cache_drop(nch.ncp);
2537         }
2538         if (fakename)
2539                 kfree(fakename, M_TEMP);
2540         return (error);
2541 }
2542
2543 /*
2544  * Do an inefficient scan of the directory represented by ncp looking for
2545  * the directory vnode dvp.  ncp must be held but not locked on entry and
2546  * will be held on return.  dvp must be refd but not locked on entry and
2547  * will remain refd on return.
2548  *
2549  * Why do this at all?  Well, due to its stateless nature the NFS server
2550  * converts file handles directly to vnodes without necessarily going through
2551  * the namecache ops that would otherwise create the namecache topology
2552  * leading to the vnode.  We could either (1) Change the namecache algorithms
2553  * to allow disconnect namecache records that are re-merged opportunistically,
2554  * or (2) Make the NFS server backtrack and scan to recover a connected
2555  * namecache topology in order to then be able to issue new API lookups.
2556  *
2557  * It turns out that (1) is a huge mess.  It takes a nice clean set of
2558  * namecache algorithms and introduces a lot of complication in every subsystem
2559  * that calls into the namecache to deal with the re-merge case, especially
2560  * since we are using the namecache to placehold negative lookups and the
2561  * vnode might not be immediately assigned. (2) is certainly far less
2562  * efficient then (1), but since we are only talking about directories here
2563  * (which are likely to remain cached), the case does not actually run all
2564  * that often and has the supreme advantage of not polluting the namecache
2565  * algorithms.
2566  *
2567  * If a fakename is supplied just construct a namecache entry using the
2568  * fake name.
2569  */
2570 static int
2571 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2572                        struct vnode *dvp, char *fakename)
2573 {
2574         struct nlcomponent nlc;
2575         struct nchandle rncp;
2576         struct dirent *den;
2577         struct vnode *pvp;
2578         struct vattr vat;
2579         struct iovec iov;
2580         struct uio uio;
2581         int blksize;
2582         int eofflag;
2583         int bytes;
2584         char *rbuf;
2585         int error;
2586
2587         vat.va_blocksize = 0;
2588         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2589                 return (error);
2590         cache_lock(nch);
2591         error = cache_vref(nch, cred, &pvp);
2592         cache_unlock(nch);
2593         if (error)
2594                 return (error);
2595         if (ncvp_debug) {
2596                 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2597                         "vattr fileid = %lld\n",
2598                         nch->ncp, nch->ncp->nc_name,
2599                         vat.va_blocksize,
2600                         (long long)vat.va_fileid);
2601         }
2602
2603         /*
2604          * Use the supplied fakename if not NULL.  Fake names are typically
2605          * not in the actual filesystem hierarchy.  This is used by HAMMER
2606          * to glue @@timestamp recursions together.
2607          */
2608         if (fakename) {
2609                 nlc.nlc_nameptr = fakename;
2610                 nlc.nlc_namelen = strlen(fakename);
2611                 rncp = cache_nlookup(nch, &nlc);
2612                 goto done;
2613         }
2614
2615         if ((blksize = vat.va_blocksize) == 0)
2616                 blksize = DEV_BSIZE;
2617         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2618         rncp.ncp = NULL;
2619
2620         eofflag = 0;
2621         uio.uio_offset = 0;
2622 again:
2623         iov.iov_base = rbuf;
2624         iov.iov_len = blksize;
2625         uio.uio_iov = &iov;
2626         uio.uio_iovcnt = 1;
2627         uio.uio_resid = blksize;
2628         uio.uio_segflg = UIO_SYSSPACE;
2629         uio.uio_rw = UIO_READ;
2630         uio.uio_td = curthread;
2631
2632         if (ncvp_debug >= 2)
2633                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2634         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2635         if (error == 0) {
2636                 den = (struct dirent *)rbuf;
2637                 bytes = blksize - uio.uio_resid;
2638
2639                 while (bytes > 0) {
2640                         if (ncvp_debug >= 2) {
2641                                 kprintf("cache_inefficient_scan: %*.*s\n",
2642                                         den->d_namlen, den->d_namlen,
2643                                         den->d_name);
2644                         }
2645                         if (den->d_type != DT_WHT &&
2646                             den->d_ino == vat.va_fileid) {
2647                                 if (ncvp_debug) {
2648                                         kprintf("cache_inefficient_scan: "
2649                                                "MATCHED inode %lld path %s/%*.*s\n",
2650                                                (long long)vat.va_fileid,
2651                                                nch->ncp->nc_name,
2652                                                den->d_namlen, den->d_namlen,
2653                                                den->d_name);
2654                                 }
2655                                 nlc.nlc_nameptr = den->d_name;
2656                                 nlc.nlc_namelen = den->d_namlen;
2657                                 rncp = cache_nlookup(nch, &nlc);
2658                                 KKASSERT(rncp.ncp != NULL);
2659                                 break;
2660                         }
2661                         bytes -= _DIRENT_DIRSIZ(den);
2662                         den = _DIRENT_NEXT(den);
2663                 }
2664                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2665                         goto again;
2666         }
2667         kfree(rbuf, M_TEMP);
2668 done:
2669         vrele(pvp);
2670         if (rncp.ncp) {
2671                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2672                         _cache_setvp(rncp.mount, rncp.ncp, dvp);
2673                         if (ncvp_debug >= 2) {
2674                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2675                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2676                         }
2677                 } else {
2678                         if (ncvp_debug >= 2) {
2679                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2680                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2681                                         rncp.ncp->nc_vp);
2682                         }
2683                 }
2684                 if (rncp.ncp->nc_vp == NULL)
2685                         error = rncp.ncp->nc_error;
2686                 /*
2687                  * Release rncp after a successful nlookup.  rncp was fully
2688                  * referenced.
2689                  */
2690                 cache_put(&rncp);
2691         } else {
2692                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2693                         dvp, nch->ncp->nc_name);
2694                 error = ENOENT;
2695         }
2696         return (error);
2697 }
2698
2699 /*
2700  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2701  * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list.
2702  *
2703  * Then, if there are no additional references to the ncp and no children,
2704  * the ncp is removed from the topology and destroyed.
2705  *
2706  * References and/or children may exist if the ncp is in the middle of the
2707  * topology, preventing the ncp from being destroyed.
2708  *
2709  * This function must be called with the ncp held and locked and will unlock
2710  * and drop it during zapping.
2711  *
2712  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2713  * This case can occur in the cache_drop() path.
2714  *
2715  * This function may return a held (but NOT locked) parent node which the
2716  * caller must drop.  We do this so _cache_drop() can loop, to avoid
2717  * blowing out the kernel stack.
2718  *
      * Return value:
      *   NULL   - the ncp was not destroyed (it still has other refs or
      *            children, or the zap was deferred because the parent could
      *            not be locked without blocking), or the ncp was destroyed
      *            but had no parent.  In the not-destroyed cases our ref has
      *            been given up and the ncp unlocked.
      *   parent - the ncp was destroyed; the returned parent is held but
      *            not locked.
      *
2719  * WARNING!  For MPSAFE operation this routine must acquire up to three
2720  *           spin locks to be able to safely test nc_refs.  Lock order is
2721  *           very important.
2722  *
2723  *           hash spinlock if on hash list
2724  *           parent spinlock if child of parent
2725  *           (the ncp is unresolved so there is no vnode association)
2726  */
2727 static struct namecache *
2728 cache_zap(struct namecache *ncp, int nonblock)
2729 {
2730         struct namecache *par;
2731         struct vnode *dropvp;
2732         struct nchash_head *nchpp;
2733         int refs;
2734
2735         /*
2736          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2737          */
2738         _cache_setunresolved(ncp);
2739
2740         /*
2741          * Try to scrap the entry and possibly tail-recurse on its parent.
2742          * We only scrap unref'd (other than our ref) unresolved entries,
2743          * we do not scrap 'live' entries.
2744          *
2745          * Note that once the spinlocks are acquired if nc_refs == 1 no
2746          * other references are possible.  If it isn't, however, we have
2747          * to decrement but also be sure to avoid a 1->0 transition.
2748          */
2749         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2750         KKASSERT(ncp->nc_refs > 0);
2751
2752         /*
2753          * Acquire locks.  Note that the parent can't go away while we hold
2754          * a child locked.
2755          */
2756         nchpp = NULL;
2757         if ((par = ncp->nc_parent) != NULL) {
2758                 if (nonblock) {
2759                         for (;;) {
2760                                 if (_cache_lock_nonblock(par) == 0)
2761                                         break;
                                      /*
                                       * The parent lock is contended.  Flag
                                       * the ncp as a deferred zap and try to
                                       * give back our ref with a cmpset.  If
                                       * the cmpset succeeds we are done with
                                       * this ncp; if it loses a race on
                                       * nc_refs we retry the parent lock.
                                       */
2762                                 refs = ncp->nc_refs;
2763                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2764                                 ++numdefered;   /* MP race ok */
2765                                 if (atomic_cmpset_int(&ncp->nc_refs,
2766                                                       refs, refs - 1)) {
2767                                         _cache_unlock(ncp);
2768                                         return(NULL);
2769                                 }
2770                                 cpu_pause();
2771                         }
2772                         _cache_hold(par);
2773                 } else {
2774                         _cache_hold(par);
2775                         _cache_lock(par);
2776                 }
                      /*
                       * par is now held and locked on both paths.  Lock the
                       * hash chain the ncp lives on.
                       */
2777                 nchpp = ncp->nc_head;
2778                 spin_lock(&nchpp->spin);
2779         }
2780
2781         /*
2782          * At this point if we find refs == 1 it should not be possible for
2783          * anyone else to have access to the ncp.  We are holding the only
2784          * possible access point left (nchpp) spin-locked.
2785          *
2786          * If someone other than us has a ref or we have children
2787          * we cannot zap the entry.  The 1->0 transition and any
2788          * further list operation is protected by the spinlocks
2789          * we have acquired but other transitions are not.
2790          */
2791         for (;;) {
2792                 refs = ncp->nc_refs;
2793                 cpu_ccfence();
2794                 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
2795                         break;
                      /*
                       * Cannot destroy.  Give back our ref, undo the parent
                       * hold/lock and the hash spinlock taken above, unlock
                       * the ncp and bail.  A failed cmpset means nc_refs
                       * changed underneath us, so re-evaluate.
                       */
2796                 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
2797                         if (par) {
2798                                 spin_unlock(&nchpp->spin);
2799                                 _cache_put(par);
2800                         }
2801                         _cache_unlock(ncp);
2802                         return(NULL);
2803                 }
2804                 cpu_pause();
2805         }
2806
2807         /*
2808          * We are the only ref and with the spinlocks held no further
2809          * refs can be acquired by others.
2810          *
2811          * Remove us from the hash list and parent list.  We have to
2812          * drop a ref on the parent's vp if the parent's list becomes
2813          * empty.
2814          */
2815         dropvp = NULL;
2816         if (par) {
2817                 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
2818
2819                 KKASSERT(nchpp == ncp->nc_head);
2820                 LIST_REMOVE(ncp, nc_hash);
2821                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
                      /*
                       * Adjust this cpu's counters: one less entry, and one
                       * less leaf if the ncp itself had no children.
                       */
2822                 atomic_add_long(&pn->vfscache_count, -1);
2823                 if (TAILQ_EMPTY(&ncp->nc_list))
2824                         atomic_add_long(&pn->vfscache_leafs, -1);
2825
                      /*
                       * The parent just lost its last child: count it as a
                       * leaf and remember its vnode (if any) so it can be
                       * vdrop()'d once the locks have been released.
                       */
2826                 if (TAILQ_EMPTY(&par->nc_list)) {
2827                         atomic_add_long(&pn->vfscache_leafs, 1);
2828                         if (par->nc_vp)
2829                                 dropvp = par->nc_vp;
2830                 }
2831                 ncp->nc_head = NULL;
2832                 ncp->nc_parent = NULL;
2833                 spin_unlock(&nchpp->spin);
                      /*
                       * Only the lock is released; the hold on par is kept
                       * and handed to the caller through the return value.
                       */
2834                 _cache_unlock(par);
2835         } else {
2836                 KKASSERT(ncp->nc_head == NULL);
2837         }
2838
2839         /*
2840          * ncp should not have picked up any refs.  Physically
2841          * destroy the ncp.
2842          */
2843         if (ncp->nc_refs != 1) {
2844                 int save_refs = ncp->nc_refs;
2845                 cpu_ccfence();
2846                 panic("cache_zap: %p bad refs %d (%d)\n",
2847                         ncp, save_refs, atomic_fetchadd_int(&ncp->nc_refs, 0));
2848         }
2849         KKASSERT(ncp->nc_refs == 1);
2850         /* _cache_unlock(ncp) not required */
2851         ncp->nc_refs = -1;      /* safety */
2852         if (ncp->nc_name)
2853                 kfree(ncp->nc_name, M_VFSCACHE);
2854         kfree(ncp, M_VFSCACHE);
2855
2856         /*
2857          * Delayed drop (we had to release our spinlocks)
2858          *
2859          * The refed parent (if not  NULL) must be dropped.  The
2860          * caller is responsible for looping.
2861          */
2862         if (dropvp)
2863                 vdrop(dropvp);
2864         return(par);
2865 }
2866
2867 /*
2868  * Clean up dangling negative cache and defered-drop entries in the
2869  * namecache.
2870  *
2871  * This routine is called in the critical path and also called from
2872  * vnlru().  When called from vnlru we use a lower limit to try to
2873  * deal with the negative cache before the critical path has to start
2874  * dealing with it.
2875  */
2876 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2877
2878 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2879 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2880
2881 void
2882 cache_hysteresis(int critpath)
2883 {
2884         long poslimit;
2885         long neglimit = maxvnodes / ncnegfactor;
2886         long xnumcache = vfscache_leafs;
2887
2888         if (critpath == 0)
2889                 neglimit = neglimit * 8 / 10;
2890
2891         /*
2892          * Don't cache too many negative hits.  We use hysteresis to reduce
2893          * the impact on the critical path.
2894          */
2895         switch(neg_cache_hysteresis_state[critpath]) {
2896         case CHI_LOW:
2897                 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
2898                         if (critpath)
2899                                 _cache_cleanneg(ncnegflush);
2900                         else
2901                                 _cache_cleanneg(ncnegflush +
2902                                                 vfscache_negs - neglimit);
2903                         neg_cache_hysteresis_state[critpath] = CHI_HIGH;
2904                 }
2905                 break;
2906         case CHI_HIGH:
2907                 if (vfscache_negs > MINNEG * 9 / 10 &&
2908                     vfscache_negs * 9 / 10 > neglimit
2909                 ) {
2910                         if (critpath)
2911                                 _cache_cleanneg(ncnegflush);
2912                         else
2913                                 _cache_cleanneg(ncnegflush +
2914                                                 vfscache_negs * 9 / 10 -
2915                                                 neglimit);
2916                 } else {
2917                         neg_cache_hysteresis_state[critpath] = CHI_LOW;
2918                 }
2919                 break;
2920         }
2921
2922         /*
2923          * Don't cache too many positive hits.  We use hysteresis to reduce
2924          * the impact on the critical path.
2925          *
2926          * Excessive positive hits can accumulate due to large numbers of
2927          * hardlinks (the vnode cache will not prevent hl ncps from growing
2928          * into infinity).
2929          */
2930         if ((poslimit = ncposlimit) == 0)
2931                 poslimit = maxvnodes * 2;
2932         if (critpath == 0)
2933                 poslimit = poslimit * 8 / 10;
2934
2935         switch(pos_cache_hysteresis_state[critpath]) {
2936         case CHI_LOW:
2937                 if (xnumcache > poslimit && xnumcache > MINPOS) {
2938                         if (critpath)
2939                                 _cache_cleanpos(ncposflush);
2940                         else
2941                                 _cache_cleanpos(ncposflush +
2942                                                 xnumcache - poslimit);
2943                         pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2944                 }
2945                 break;
2946         case CHI_HIGH:
2947                 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2948                         if (critpath)
2949                                 _cache_cleanpos(ncposflush);
2950                         else
2951                                 _cache_cleanpos(ncposflush +
2952                                                 xnumcache - poslimit * 5 / 6);
2953                 } else {
2954                         pos_cache_hysteresis_state[critpath] = CHI_LOW;
2955                 }
2956                 break;
2957         }
2958
2959         /*
2960          * Clean out dangling defered-zap ncps which could not
2961          * be cleanly dropped if too many build up.  Note
2962          * that numdefered is not an exact number as such ncps
2963          * can be reused and the counter is not handled in a MP
2964          * safe manner by design.
2965          */
2966         if (numdefered > neglimit) {
2967                 _cache_cleandefered();
2968         }
2969 }
2970
2971 /*
2972  * NEW NAMECACHE LOOKUP API
2973  *
2974  * Lookup an entry in the namecache.  The passed par_nch must be referenced
2975  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
2976  * is ALWAYS returned, even if the supplied component is illegal.
2977  *
2978  * The resulting namecache entry should be returned to the system with
2979  * cache_put() or cache_unlock() + cache_drop().
2980  *
2981  * namecache locks are recursive but care must be taken to avoid lock order
2982  * reversals (hence why the passed par_nch must be unlocked).  Locking
2983  * rules are to order for parent traversals, not for child traversals.
2984  *
2985  * Nobody else will be able to manipulate the associated namespace (e.g.
2986  * create, delete, rename, rename-target) until the caller unlocks the
2987  * entry.
2988  *
2989  * The returned entry will be in one of three states:  positive hit (non-null
2990  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2991  * Unresolved entries must be resolved through the filesystem to associate the
2992  * vnode and/or determine whether a positive or negative hit has occurred.
2993  *
2994  * It is not necessary to lock a directory in order to lock namespace under
2995  * that directory.  In fact, it is explicitly not allowed to do that.  A
2996  * directory is typically only locked when being created, renamed, or
2997  * destroyed.
2998  *
2999  * The directory (par) may be unresolved, in which case any returned child
3000  * will likely also be marked unresolved.  Likely but not guaranteed.  Since
3001  * the filesystem lookup requires a resolved directory vnode the caller is
3002  * responsible for resolving the namecache chain top-down.  This API
3003  * specifically allows whole chains to be created in an unresolved state.
      *
      * par_nch - parent directory handle; its ncp is the parent of the entry
      *           being looked up and its mount is copied into the result.
      * nlc     - name component (nlc_nameptr / nlc_namelen) to look up.
      *
      * The returned nchandle carries par_nch->mount, on which _cache_mntref()
      * has been called.  As a side effect the per-cpu ncs_miss, ncs_goodhits
      * or ncs_neghits counter is bumped according to the state of the entry.
3004  */
3005 struct nchandle
3006 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
3007 {
3008         struct nchandle nch;
3009         struct namecache *ncp;
3010         struct namecache *new_ncp;
3011         struct nchash_head *nchpp;
3012         struct mount *mp;
3013         u_int32_t hash;
3014         globaldata_t gd;
3015         int par_locked;
3016
3017         gd = mycpu;
3018         mp = par_nch->mount;
3019         par_locked = 0;
3020
3021         /*
3022          * This is a good time to call it, no ncp's are locked by
3023          * the caller or us.
3024          */
3025         cache_hysteresis(1);
3026
3027         /*
3028          * Try to locate an existing entry
              *
              * The hash covers the component name followed by the value of
              * the parent ncp pointer, and selects the hash chain to search.
3029          */
3030         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3031         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3032         new_ncp = NULL;
3033         nchpp = NCHHASH(hash);
3034 restart:
             /*
              * A non-NULL new_ncp means we are prepared to insert, so the
              * chain is locked exclusively.  Otherwise we are only searching
              * and a shared lock is used.
              */
3035         if (new_ncp)
3036                 spin_lock(&nchpp->spin);
3037         else
3038                 spin_lock_shared(&nchpp->spin);
3039
3040         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
3041                 /*
3042                  * Break out if we find a matching entry.  Note that
3043                  * UNRESOLVED entries may match, but DESTROYED entries
3044                  * do not.
3045                  */
3046                 if (ncp->nc_parent == par_nch->ncp &&
3047                     ncp->nc_nlen == nlc->nlc_namelen &&
3048                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3049                     (ncp->nc_flag & NCF_DESTROYED) == 0
3050                 ) {
                              /*
                               * Hold the candidate before releasing the
                               * chain spinlock (in whichever mode it was
                               * taken).
                               */
3051                         _cache_hold(ncp);
3052                         if (new_ncp)
3053                                 spin_unlock(&nchpp->spin);
3054                         else
3055                                 spin_unlock_shared(&nchpp->spin);
                              /*
                               * The parent may have been locked on an earlier
                               * pass in preparation for an insert.  It is
                               * released before the child is locked.
                               */
3056                         if (par_locked) {
3057                                 _cache_unlock(par_nch->ncp);
3058                                 par_locked = 0;
3059                         }
3060                         if (_cache_lock_special(ncp) == 0) {
3061                                 /*
3062                                  * Successfully locked but we must re-test
3063                                  * conditions that might have changed since
3064                                  * we did not have the lock before.
3065                                  */
3066                                 if (ncp->nc_parent != par_nch->ncp ||
3067                                     ncp->nc_nlen != nlc->nlc_namelen ||
3068                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3069                                          ncp->nc_nlen) ||
3070                                     (ncp->nc_flag & NCF_DESTROYED)) {
3071                                         _cache_put(ncp);
3072                                         goto restart;
3073                                 }
3074                                 _cache_auto_unresolve(mp, ncp);
                                      /*
                                       * An existing entry won, discard the
                                       * entry we pre-allocated (if any).
                                       */
3075                                 if (new_ncp)
3076                                         _cache_free(new_ncp);
3077                                 goto found;
3078                         }
                              /*
                               * _cache_lock_special() failed.  Wait for the
                               * current lock holder by cycling the lock, then
                               * release the hold taken above (presumably what
                               * the final _cache_drop() pairs with) and
                               * rescan from the top.
                               */
3079                         _cache_get(ncp);        /* cycle the lock to block */
3080                         _cache_put(ncp);
3081                         _cache_drop(ncp);
3082                         goto restart;
3083                 }
3084         }
3085
3086         /*
3087          * We failed to locate an entry, create a new entry and add it to
3088          * the cache.  The parent ncp must also be locked so we
3089          * can link into it.
3090          *
3091          * We have to relookup after possibly blocking in kmalloc or
3092          * when locking par_nch.
3093          *
3094          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3095          *       mount case, in which case nc_name will be NULL.
3096          */
3097         if (new_ncp == NULL) {
3098                 spin_unlock_shared(&nchpp->spin);
3099                 new_ncp = cache_alloc(nlc->nlc_namelen);
                      /* Copy the name in and NUL-terminate it. */
3100                 if (nlc->nlc_namelen) {
3101                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3102                               nlc->nlc_namelen);
3103                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3104                 }
3105                 goto restart;
3106         }
3107
3108         /*
3109          * NOTE! The spinlock is held exclusively here because new_ncp
3110          *       is non-NULL.
              *
              * The spinlock is released before the parent is locked, which
              * is why the chain has to be searched again afterwards.
3111          */
3112         if (par_locked == 0) {
3113                 spin_unlock(&nchpp->spin);
3114                 _cache_lock(par_nch->ncp);
3115                 par_locked = 1;
3116                 goto restart;
3117         }
3118
3119         /*
3120          * WARNING!  We still hold the spinlock.  We have to set the hash
3121          *           table entry atomically.
3122          */
3123         ncp = new_ncp;
3124         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3125         spin_unlock(&nchpp->spin);
3126         _cache_unlock(par_nch->ncp);
3127         /* par_locked = 0 - not used */
3128 found:
3129         /*
3130          * stats and namecache size management
              *
              * Unresolved entries count as a miss, entries with a vnode as a
              * good hit, and resolved entries without a vnode as a negative
              * hit.
3131          */
3132         if (ncp->nc_flag & NCF_UNRESOLVED)
3133                 ++gd->gd_nchstats->ncs_miss;
3134         else if (ncp->nc_vp)
3135                 ++gd->gd_nchstats->ncs_goodhits;
3136         else
3137                 ++gd->gd_nchstats->ncs_neghits;
3138         nch.mount = mp;
3139         nch.ncp = ncp;
3140         _cache_mntref(nch.mount);
3141
3142         return(nch);
3143 }
3144
3145 /*
3146  * Attempt to lookup a namecache entry and return with a shared namecache
3147  * lock.
      *
      * Only an existing entry can be returned; nothing is allocated here.
      * EWOULDBLOCK is returned, with res_nch left untouched, when:
      *
      *  - shared namecache locks are disabled or (excl) is non-zero,
      *  - no matching entry is present on the hash chain,
      *  - _cache_lock_shared_special() returns non-zero, or
      *  - after locking, the entry no longer matches, is DESTROYED or
      *    UNRESOLVED, or _cache_auto_unresolve_test() returns non-zero.
      *
      * On success res_nch is set to {par_nch->mount, ncp}, the mount is
      * referenced with _cache_mntref(), the ncp keeps the hold and lock
      * obtained here, and ncp->nc_error is returned.
3148  */
3149 int
3150 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc,
3151                            int excl, struct nchandle *res_nch)
3152 {
3153         struct namecache *ncp;
3154         struct nchash_head *nchpp;
3155         struct mount *mp;
3156         u_int32_t hash;
3157         globaldata_t gd;
3158
3159         /*
3160          * If exclusive requested or shared namecache locks are disabled,
3161          * return failure.
3162          */
3163         if (ncp_shared_lock_disable || excl)
3164                 return(EWOULDBLOCK);
3165
3166         gd = mycpu;
3167         mp = par_nch->mount;
3168
3169         /*
3170          * This is a good time to call it, no ncp's are locked by
3171          * the caller or us.
3172          */
3173         cache_hysteresis(1);
3174
3175         /*
3176          * Try to locate an existing entry
              *
              * The hash covers the name bytes followed by the value of the
              * parent ncp pointer.
3177          */
3178         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3179         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3180         nchpp = NCHHASH(hash);
3181
3182         spin_lock_shared(&nchpp->spin);
3183
3184         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
3185                 /*
3186                  * Break out if we find a matching entry.  Note that
3187                  * UNRESOLVED entries may match, but DESTROYED entries
3188                  * do not.
3189                  */
3190                 if (ncp->nc_parent == par_nch->ncp &&
3191                     ncp->nc_nlen == nlc->nlc_namelen &&
3192                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3193                     (ncp->nc_flag & NCF_DESTROYED) == 0
3194                 ) {
                              /*
                               * Hold the entry before releasing the chain
                               * spinlock, then try for the ncp lock.
                               */
3195                         _cache_hold(ncp);
3196                         spin_unlock_shared(&nchpp->spin);
3197                         if (_cache_lock_shared_special(ncp) == 0) {
                                      /*
                                       * Re-validate under the ncp lock.  Unlike
                                       * the chain scan above, an UNRESOLVED
                                       * entry is rejected here.
                                       */
3198                                 if (ncp->nc_parent == par_nch->ncp &&
3199                                     ncp->nc_nlen == nlc->nlc_namelen &&
3200                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3201                                          ncp->nc_nlen) == 0 &&
3202                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
3203                                     (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
3204                                     _cache_auto_unresolve_test(mp, ncp) == 0) {
3205                                         goto found;
3206                                 }
3207                                 _cache_unlock(ncp);
3208                         }
3209                         _cache_drop(ncp);
                              /*
                               * Re-take the chain spinlock only so that the
                               * common failure exit below can release it.
                               * Only the first matching entry is tried.
                               */
3210                         spin_lock_shared(&nchpp->spin);
3211                         break;
3212                 }
3213         }
3214
3215         /*
3216          * Failure
3217          */
3218         spin_unlock_shared(&nchpp->spin);
3219         return(EWOULDBLOCK);
3220
3221         /*
3222          * Success
3223          *
3224          * Note that nc_error might be non-zero (e.g ENOENT).
              *
              * NOTE(review): ncs_goodhits is bumped even when the entry has no
              * vp, whereas cache_nlookup_nonblock() counts that case under
              * ncs_neghits -- confirm whether this is intentional.
3225          */
3226 found:
3227         res_nch->mount = mp;
3228         res_nch->ncp = ncp;
3229         ++gd->gd_nchstats->ncs_goodhits;
3230         _cache_mntref(res_nch->mount);
3231
3232         KKASSERT(ncp->nc_error != EWOULDBLOCK);
3233         return(ncp->nc_error);
3234 }
3235
3236 /*
3237  * This is a non-blocking version of cache_nlookup() used by
3238  * nfs_readdirplusrpc_uio().  It can fail for any reason and
3239  * will return nch.ncp == NULL in that case.
      *
      * On success the returned nch holds {par_nch->mount, ncp} and the mount
      * has been referenced with _cache_mntref().  The ncp is either an
      * existing entry that was held and then locked with
      * _cache_lock_special(), or a new entry obtained from cache_alloc() and
      * linked under par_nch->ncp.
      *
      * Failure cases visible here: the matching entry cannot be locked, the
      * entry changed or was destroyed before the lock was obtained (this is
      * logged with kprintf), or the parent cannot be locked with
      * _cache_lock_nonblock().  Any entry preallocated by this call is freed
      * and {NULL, NULL} is returned.
      *
      * NOTE(review): per the comment in the body, cache_alloc() may itself
      * block in kmalloc -- confirm that is acceptable for callers.
3240  */
3241 struct nchandle
3242 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
3243 {
3244         struct nchandle nch;
3245         struct namecache *ncp;
3246         struct namecache *new_ncp;
3247         struct nchash_head *nchpp;
3248         struct mount *mp;
3249         u_int32_t hash;
3250         globaldata_t gd;
3251         int par_locked;
3252
3253         gd = mycpu;
3254         mp = par_nch->mount;
3255         par_locked = 0;
3256
3257         /*
3258          * Try to locate an existing entry
3259          */
3260         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3261         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3262         new_ncp = NULL;
3263         nchpp = NCHHASH(hash);
3264 restart:
              /*
               * Every pass rescans the chain from scratch with the chain
               * spinlock held exclusively.
               */
3265         spin_lock(&nchpp->spin);
3266         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
3267                 /*
3268                  * Break out if we find a matching entry.  Note that
3269                  * UNRESOLVED entries may match, but DESTROYED entries
3270                  * do not.
3271                  */
3272                 if (ncp->nc_parent == par_nch->ncp &&
3273                     ncp->nc_nlen == nlc->nlc_namelen &&
3274                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3275                     (ncp->nc_flag & NCF_DESTROYED) == 0
3276                 ) {
3277                         _cache_hold(ncp);
3278                         spin_unlock(&nchpp->spin);
                              /*
                               * An existing entry was found, so the parent
                               * lock (taken only for linking a new entry) is
                               * no longer needed.
                               */
3279                         if (par_locked) {
3280                                 _cache_unlock(par_nch->ncp);
3281                                 par_locked = 0;
3282                         }
3283                         if (_cache_lock_special(ncp) == 0) {
                                      /*
                                       * Re-validate under the ncp lock; a
                                       * mismatch is reported and treated as
                                       * failure rather than retried.
                                       */
3284                                 if (ncp->nc_parent != par_nch->ncp ||
3285                                     ncp->nc_nlen != nlc->nlc_namelen ||
3286                                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
3287                                     (ncp->nc_flag & NCF_DESTROYED)) {
3288                                         kprintf("cache_lookup_nonblock: "
3289                                                 "ncp-race %p %*.*s\n",
3290                                                 ncp,
3291                                                 nlc->nlc_namelen,
3292                                                 nlc->nlc_namelen,
3293                                                 nlc->nlc_nameptr);
3294                                         _cache_unlock(ncp);
3295                                         _cache_drop(ncp);
3296                                         goto failed;
3297                                 }
3298                                 _cache_auto_unresolve(mp, ncp);
3299                                 if (new_ncp) {
3300                                         _cache_free(new_ncp);
3301                                         new_ncp = NULL;
3302                                 }
3303                                 goto found;
3304                         }
                              /* could not lock the entry: give up the hold */
3305                         _cache_drop(ncp);
3306                         goto failed;
3307                 }
3308         }
3309
3310         /*
3311          * We failed to locate an entry, create a new entry and add it to
3312          * the cache.  The parent ncp must also be locked so we
3313          * can link into it.
3314          *
3315          * We have to relookup after possibly blocking in kmalloc or
3316          * when locking par_nch.
3317          *
3318          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3319          *       mount case, in which case nc_name will be NULL.
3320          */
3321         if (new_ncp == NULL) {
3322                 spin_unlock(&nchpp->spin);
3323                 new_ncp = cache_alloc(nlc->nlc_namelen);
3324                 if (nlc->nlc_namelen) {
3325                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3326                               nlc->nlc_namelen);
3327                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3328                 }
3329                 goto restart;
3330         }
              /*
               * The parent lock is only attempted after dropping the chain
               * spinlock; if it cannot be obtained without blocking we fail.
               */
3331         if (par_locked == 0) {
3332                 spin_unlock(&nchpp->spin);
3333                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3334                         par_locked = 1;
3335                         goto restart;
3336                 }
3337                 goto failed;
3338         }
3339
3340         /*
3341          * WARNING!  We still hold the spinlock.  We have to set the hash
3342          *           table entry atomically.
3343          */
3344         ncp = new_ncp;
3345         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3346         spin_unlock(&nchpp->spin);
3347         _cache_unlock(par_nch->ncp);
3348         /* par_locked = 0 - not used */
3349 found:
3350         /*
3351          * stats and namecache size management
3352          */
3353         if (ncp->nc_flag & NCF_UNRESOLVED)
3354                 ++gd->gd_nchstats->ncs_miss;
3355         else if (ncp->nc_vp)
3356                 ++gd->gd_nchstats->ncs_goodhits;
3357         else
3358                 ++gd->gd_nchstats->ncs_neghits;
3359         nch.mount = mp;
3360         nch.ncp = ncp;
3361         _cache_mntref(nch.mount);
3362
3363         return(nch);
3364 failed:
              /*
               * Every failure path arrives here with the chain spinlock and
               * the parent lock already released.
               */
3365         if (new_ncp) {
3366                 _cache_free(new_ncp);
3367                 new_ncp = NULL;
3368         }
3369         nch.mount = NULL;
3370         nch.ncp = NULL;
3371         return(nch);
3372 }
3373
3374 /*
3375  * The namecache entry is marked as being used as a mount point.
3376  * Locate the mount if it is visible to the caller.  The DragonFly
3377  * mount system allows arbitrary loops in the topology and disentangles
3378  * those loops by matching against (mp, ncp) rather than just (ncp).
3379  * This means any given ncp can dive any number of mounts, depending
3380  * on the relative mount (e.g. nullfs) the caller is at in the topology.
3381  *
3382  * We use a very simple frontend cache to reduce SMP conflicts,
3383  * which we have to do because the mountlist scan needs an exclusive
3384  * lock around its ripout info list.  Not to mention that there might
3385  * be a lot of mounts.
3386  */
3387 struct findmount_info {
3388         struct mount *result;           /* match set by the scan callback, else NULL */
3389         struct mount *nch_mount;        /* mount half of the {mount, ncp} searched for */
3390         struct namecache *nch_ncp;      /* ncp half of the {mount, ncp} searched for */
3391 };
3392
3393 #define MNTCACHE_PRIME  66555444443333333ULL    /* modulus used by ncmount_cache_lookup(); NOTE(review): primality not verified */
3394
     /*
      * Map a {mp, ncp} pair to its slot in the ncmount_cache[] array.
      *
      * Only the two pointer values are hashed; neither pointer is
      * dereferenced and no lock is taken.  The slot returned may currently
      * describe a different pair, which is why the callers in this file
      * compare the slot's contents before trusting it.
      */
3395 static
3396 struct ncmount_cache *
3397 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3398 {
3399         uintptr_t hash;
3400
              /*
               * Fold each pointer with a copy of itself shifted right by 18,
               * reduce modulo MNTCACHE_PRIME after each step, then reduce to
               * a table index.
               */
3401         hash = (uintptr_t)mp + ((uintptr_t)mp >> 18);
3402         hash %= MNTCACHE_PRIME;
3403         hash ^= (uintptr_t)ncp + ((uintptr_t)ncp >> 18);
3404         hash %= MNTCACHE_PRIME;
3405         hash = hash % NCMOUNT_NUMCACHE;
3406
3407         return (&ncmount_cache[hash]);
3408 }
3409
     /*
      * mountlist_scan() callback used by cache_findmount().
      *
      * (data) is a struct findmount_info.  When mp's mounted-on point
      * (mnt_ncmounton) equals the {nch_mount, nch_ncp} pair being searched
      * for, mp is stored in info->result, referenced with _cache_mntref(),
      * and -1 is returned; otherwise 0 is returned.
      *
      * NOTE(review): -1 presumably terminates the scan -- confirm against
      * mountlist_scan().
      */
3410 static
3411 int
3412 cache_findmount_callback(struct mount *mp, void *data)
3413 {
3414         struct findmount_info *info = data;
3415
3416         /*
3417          * Check the mount's mounted-on point against the passed nch.
3418          */
3419         if (mp->mnt_ncmounton.mount == info->nch_mount &&
3420             mp->mnt_ncmounton.ncp == info->nch_ncp
3421         ) {
3422             info->result = mp;
3423             _cache_mntref(mp);
3424             return(-1);
3425         }
3426         return(0);
3427 }
3428
     /*
      * Find the mount whose mounted-on point is (nch), if any.
      *
      * Returns the mount with a reference added via _cache_mntref(), or NULL
      * if no mount matches.  The per-slot ncmount_cache is consulted first
      * (unless ncmount_cache_enable is 0) and is updated with the result of
      * the slow mountlist scan.
      *
      * NOTE(review): the returned reference is presumably released by the
      * caller with cache_dropmount() -- confirm against callers.
      */
3429 struct mount *
3430 cache_findmount(struct nchandle *nch)
3431 {
3432         struct findmount_info info;
3433         struct ncmount_cache *ncc;
3434         struct mount *mp;
3435
3436         /*
3437          * Fast
              *
              * Peek at the slot without the lock first; only when its ncp
              * matches do we take the shared spinlock and re-check everything.
3438          */
3439         if (ncmount_cache_enable == 0) {
3440                 ncc = NULL;
3441                 goto skip;
3442         }
3443         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3444         if (ncc->ncp == nch->ncp) {
3445                 spin_lock_shared(&ncc->spin);
3446                 if (ncc->isneg == 0 &&
3447                     ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) {
3448                         if (mp->mnt_ncmounton.mount == nch->mount &&
3449                             mp->mnt_ncmounton.ncp == nch->ncp) {
3450                                 /*
3451                                  * Cache hit (positive)
                                      *
                                      * The caller gets its own reference;
                                      * the cache keeps the one it holds.
3452                                  */
3453                                 _cache_mntref(mp);
3454                                 spin_unlock_shared(&ncc->spin);
3455                                 return(mp);
3456                         }
3457                         /* else cache miss */
3458                 }
3459                 if (ncc->isneg &&
3460                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3461                         /*
3462                          * Cache hit (negative)
3463                          */
3464                         spin_unlock_shared(&ncc->spin);
3465                         return(NULL);
3466                 }
3467                 spin_unlock_shared(&ncc->spin);
3468         }
3469 skip:
3470
3471         /*
3472          * Slow
              *
              * Scan the mount list for a mount whose mnt_ncmounton equals
              * nch.  A match comes back already referenced by the callback.
3473          */
3474         info.result = NULL;
3475         info.nch_mount = nch->mount;
3476         info.nch_ncp = nch->ncp;
3477         mountlist_scan(cache_findmount_callback, &info,
3478                        MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
3479
3480         /*
3481          * Cache the result.
3482          *
3483          * Negative lookups: We cache the originating {ncp,mp}. (mp) is
3484          *                   only used for pointer comparisons and is not
3485          *                   referenced (otherwise there would be dangling
3486          *                   refs).
3487          *
3488          * Positive lookups: We cache the originating {ncp} and the target
3489          *                   (mp).  (mp) is referenced.
3490          *
3491          * Indeterminate:    If the match is undergoing an unmount we do
3492          *                   not cache it to avoid racing cache_unmounting(),
3493          *                   but still return the match.
              *
              * In both caching cases a previously cached positive entry has
              * its mount reference released before the slot is overwritten.
3494          */
3495         if (ncc) {
3496                 spin_lock(&ncc->spin);
3497                 if (info.result == NULL) {
3498                         if (ncc->isneg == 0 && ncc->mp)
3499                                 _cache_mntrel(ncc->mp);
3500                         ncc->ncp = nch->ncp;
3501                         ncc->mp = nch->mount;
3502                         ncc->isneg = 1;
3503                         spin_unlock(&ncc->spin);
3504                 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) {
3505                         if (ncc->isneg == 0 && ncc->mp)
3506                                 _cache_mntrel(ncc->mp);
                              /* extra reference owned by the cache slot */
3507                         _cache_mntref(info.result);
3508                         ncc->ncp = nch->ncp;
3509                         ncc->mp = info.result;
3510                         ncc->isneg = 0;
3511                         spin_unlock(&ncc->spin);
3512                 } else {
3513                         spin_unlock(&ncc->spin);
3514                 }
3515         }
3516         return(info.result);
3517 }
3518
     /*
      * Release one reference on (mp) by calling _cache_mntrel().
      *
      * NOTE(review): presumably the counterpart of the reference returned
      * by cache_findmount() -- confirm against callers.
      */
3519 void
3520 cache_dropmount(struct mount *mp)
3521 {
3522         _cache_mntrel(mp);
3523 }
3524
     /*
      * Invalidate a negative ncmount_cache entry for the point (mp) is
      * mounted on.
      *
      * The slot for mp->mnt_ncmounton is checked without the lock first and
      * then re-checked under the exclusive spinlock.  If it still holds a
      * negative entry for that {mount, ncp}, its ncp and mp are cleared.
      * isneg itself is left unchanged.
      */
3525 void
3526 cache_ismounting(struct mount *mp)
3527 {
3528         struct nchandle *nch = &mp->mnt_ncmounton;
3529         struct ncmount_cache *ncc;
3530
3531         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3532         if (ncc->isneg &&
3533             ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3534                 spin_lock(&ncc->spin);
3535                 if (ncc->isneg &&
3536                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3537                         ncc->ncp = NULL;
3538                         ncc->mp = NULL;
3539                 }
3540                 spin_unlock(&ncc->spin);
3541         }
3542 }
3543
     /*
      * Remove a positive ncmount_cache entry that points at (mp).
      *
      * The slot for mp->mnt_ncmounton is checked without the lock first and
      * then re-checked under the exclusive spinlock.  If it still caches
      * (mp) for that ncp, the mount reference held by the cache is released
      * with _cache_mntrel() and the slot's ncp and mp are cleared.
      */
3544 void
3545 cache_unmounting(struct mount *mp)
3546 {
3547         struct nchandle *nch = &mp->mnt_ncmounton;
3548         struct ncmount_cache *ncc;
3549
3550         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3551         if (ncc->isneg == 0 &&
3552             ncc->ncp == nch->ncp && ncc->mp == mp) {
3553                 spin_lock(&ncc->spin);
3554                 if (ncc->isneg == 0 &&
3555                     ncc->ncp == nch->ncp && ncc->mp == mp) {
3556                         _cache_mntrel(mp);
3557                         ncc->ncp = NULL;
3558                         ncc->mp = NULL;
3559                 }
3560                 spin_unlock(&ncc->spin);
3561         }
3562 }
3563
3564 /*
3565  * Resolve an unresolved namecache entry, generally by looking it up.
3566  * The passed ncp must be locked and refd.
3567  *
3568  * Theoretically since a vnode cannot be recycled while held, and since
3569  * the nc_parent chain holds its vnode as long as children exist, the
3570  * direct parent of the cache entry we are trying to resolve should
3571  * have a valid vnode.  If not then generate an error that we can
3572  * determine is related to a resolver bug.
3573  *
3574  * However, if a vnode was in the middle of a recyclement when the NCP
3575  * got locked, ncp->nc_vp might point to a vnode that is about to become
3576  * invalid.  cache_resolve() handles this case by unresolving the entry
3577  * and then re-resolving it.
3578  *
3579  * Note that successful resolution does not necessarily return an error
3580  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
3581  * will be returned.
      *
      * Return values visible in this function: ncp->nc_error once the entry
      * is resolved, EINVAL if the ncp is DESTROYED, ENOENT if its parent is
      * DESTROYED, EXDEV if the parent chain is broken, or the nc_error of
      * an ancestor that failed to resolve with anything other than EAGAIN.
      * The lock on nch->ncp is asserted to be exclusive on entry.
3582  */
3583 int
3584 cache_resolve(struct nchandle *nch, struct ucred *cred)
3585 {
3586         struct namecache *par_tmp;
3587         struct namecache *par;
3588         struct namecache *ncp;
3589         struct nchandle nctmp;
3590         struct mount *mp;
3591         struct vnode *dvp;
3592         int error;
3593
3594         ncp = nch->ncp;
3595         mp = nch->mount;
3596         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
3597 restart:
3598         /*
3599          * If the ncp is already resolved we have nothing to do.  However,
3600          * we do want to guarantee that a usable vnode is returned when
3601          * a vnode is present, so make sure it hasn't been reclaimed.
3602          */
3603         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3604                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3605                         _cache_setunresolved(ncp);
3606                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
3607                         return (ncp->nc_error);
3608         }
3609
3610         /*
3611          * If the ncp was destroyed it will never resolve again.  This
3612          * can basically only happen when someone is chdir'd into an
3613          * empty directory which is then rmdir'd.  We want to catch this
3614          * here and not dive the VFS because the VFS might actually
3615          * have a way to re-resolve the disconnected ncp, which will
3616          * result in inconsistencies in the cdir/nch for proc->p_fd.
3617          */
3618         if (ncp->nc_flag & NCF_DESTROYED)
3619                 return(EINVAL);
3620
3621         /*
3622          * Mount points need special handling because the parent does not
3623          * belong to the same filesystem as the ncp.
3624          */
3625         if (ncp == mp->mnt_ncmountpt.ncp)
3626                 return (cache_resolve_mp(mp));
3627
3628         /*
3629          * We expect an unbroken chain of ncps to at least the mount point,
3630          * and even all the way to root (but this code doesn't have to go
3631          * past the mount point).
3632          */
3633         if (ncp->nc_parent == NULL) {
3634                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3635                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3636                 ncp->nc_error = EXDEV;
3637                 return(ncp->nc_error);
3638         }
3639
3640         /*
3641          * The vp's of the parent directories in the chain are held via vhold()
3642          * due to the existence of the child, and should not disappear.
3643          * However, there are cases where they can disappear:
3644          *
3645          *      - due to filesystem I/O errors.
3646          *      - due to NFS being stupid about tracking the namespace and
3647          *        destroys the namespace for entire directories quite often.
3648          *      - due to forced unmounts.
3649          *      - due to an rmdir (parent will be marked DESTROYED)
3650          *
3651          * When this occurs we have to track the chain backwards and resolve
3652          * it, looping until the resolver catches up to the current node.  We
3653          * could recurse here but we might run ourselves out of kernel stack
3654          * so we do it in a more painful manner.  This situation really should
3655          * not occur all that often, or if it does not have to go back too
3656          * many nodes to resolve the ncp.
3657          */
3658         while ((dvp = cache_dvpref(ncp)) == NULL) {
3659                 /*
3660                  * This case can occur if a process is CD'd into a
3661                  * directory which is then rmdir'd.  If the parent is marked
3662                  * destroyed there is no point trying to resolve it.
3663                  */
3664                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3665                         return(ENOENT);
                      /*
                       * Walk up the chain, locking hand over hand, until we
                       * reach an entry whose parent has a vp (or has no
                       * parent at all).
                       */
3666                 par = ncp->nc_parent;
3667                 _cache_hold(par);
3668                 _cache_lock(par);
3669                 while ((par_tmp = par->nc_parent) != NULL &&
3670                        par_tmp->nc_vp == NULL) {
3671                         _cache_hold(par_tmp);
3672                         _cache_lock(par_tmp);
3673                         _cache_put(par);
3674                         par = par_tmp;
3675                 }
3676                 if (par->nc_parent == NULL) {
3677                         kprintf("EXDEV case 2 %*.*s\n",
3678                                 par->nc_nlen, par->nc_nlen, par->nc_name);
3679                         _cache_put(par);
3680                         return (EXDEV);
3681                 }
3682                 /*
3683                  * The parent is not set in stone, ref and lock it to prevent
3684                  * it from disappearing.  Also note that due to renames it
3685                  * is possible for our ncp to move and for par to no longer
3686                  * be one of its parents.  We resolve it anyway, the loop
3687                  * will handle any moves.
3688                  */
3689                 _cache_get(par);        /* additional hold/lock */
3690                 _cache_put(par);        /* from earlier hold/lock */
3691                 if (par == nch->mount->mnt_ncmountpt.ncp) {
                              /* result is picked up from par->nc_error below */
3692                         cache_resolve_mp(nch->mount);
3693                 } else if ((dvp = cache_dvpref(par)) == NULL) {
3694                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
3695                         _cache_put(par);
3696                         continue;
3697                 } else {
3698                         if (par->nc_flag & NCF_UNRESOLVED) {
3699                                 nctmp.mount = mp;
3700                                 nctmp.ncp = par;
3701                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3702                         }
3703                         vrele(dvp);
3704                 }
                      /*
                       * Any ancestor error other than EAGAIN aborts the whole
                       * resolve; EAGAIN is only logged and the outer loop
                       * tries again.
                       */
3705                 if ((error = par->nc_error) != 0) {
3706                         if (par->nc_error != EAGAIN) {
3707                                 kprintf("EXDEV case 3 %*.*s error %d\n",
3708                                     par->nc_nlen, par->nc_nlen, par->nc_name,
3709                                     par->nc_error);
3710                                 _cache_put(par);
3711                                 return(error);
3712                         }
3713                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3714                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
3715                 }
3716                 _cache_put(par);
3717                 /* loop */
3718         }
3719
3720         /*
3721          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3722          * ncp's and reattach them.  If this occurs the original ncp is marked
3723          * EAGAIN to force a relookup.
3724          *
3725          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3726          * ncp must already be resolved.
              *
              * NOTE(review): the loop above only exits with dvp != NULL, so
              * the EPERM branch below looks unreachable -- confirm.
3727          */
3728         if (dvp) {
3729                 nctmp.mount = mp;
3730                 nctmp.ncp = ncp;
3731                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3732                 vrele(dvp);
3733         } else {
3734                 ncp->nc_error = EPERM;
3735         }
3736         if (ncp->nc_error == EAGAIN) {
3737                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3738                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3739                 goto restart;
3740         }
3741         return(ncp->nc_error);
3742 }
3743
3744 /*
3745  * Resolve the ncp associated with a mount point.  Such ncp's almost always
3746  * remain resolved and this routine is rarely called.  NFS MPs tends to force
3747  * re-resolution more often due to its mac-truck-smash-the-namecache
3748  * method of tracking namespace changes.
3749  *
3750  * The semantics for this call is that the passed ncp must be locked on
3751  * entry and will be locked on return.  However, if we actually have to
3752  * resolve the mount point we temporarily unlock the entry in order to
3753  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
3754  * the unlock we have to recheck the flags after we relock.
      *
      * (mp) must not be NULL; this is asserted before mp is dereferenced.
      * Returns the nc_error of mp->mnt_ncmountpt.ncp, which is set from the
      * VFS_ROOT() result when this call performs the resolution.
3755  */
3756 static int
3757 cache_resolve_mp(struct mount *mp)
3758 {
3759         struct namecache *ncp;
3760         struct vnode *vp;
3761         int error;
3762
3763         KKASSERT(mp != NULL);
3764         ncp = mp->mnt_ncmountpt.ncp;    /* only after mp has been checked */

3765         /*
3766          * If the ncp is already resolved we have nothing to do.  However,
3767          * we do want to guarantee that a usable vnode is returned when
3768          * a vnode is present, so make sure it hasn't been reclaimed.
3769          */
3770         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3771                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3772                         _cache_setunresolved(ncp);
3773         }
3774
3775         if (ncp->nc_flag & NCF_UNRESOLVED) {
                      /*
                       * Drop the ncp lock across vfs_busy()/VFS_ROOT() and
                       * retry vfs_busy() until it returns 0.
                       */
3776                 _cache_unlock(ncp);
3777                 while (vfs_busy(mp, 0))
3778                         ;
3779                 error = VFS_ROOT(mp, &vp);
3780                 _cache_lock(ncp);
3781
3782                 /*
3783                  * recheck the ncp state after relocking.
                       *
                       * If the entry got resolved while we were unlocked we
                       * only have to dispose of the vnode VFS_ROOT() gave us.
3784                  */
3785                 if (ncp->nc_flag & NCF_UNRESOLVED) {
3786                         ncp->nc_error = error;
3787                         if (error == 0) {
3788                                 _cache_setvp(mp, ncp, vp);
3789                                 vput(vp);
3790                         } else {
3791                                 kprintf("[diagnostic] cache_resolve_mp: failed"
3792                                         " to resolve mount %p err=%d ncp=%p\n",
3793                                         mp, error, ncp);
3794                                 _cache_setvp(mp, ncp, NULL);
3795                         }
3796                 } else if (error == 0) {
3797                         vput(vp);
3798                 }
3799                 vfs_unbusy(mp);
3800         }
3801         return(ncp->nc_error);
3802 }
3803
3804 /*
3805  * Clean out negative cache entries when too many have accumulated.
      *
      * Each call works on a single per-cpu negative list, selected
      * round-robin through neg_rover.  (count) is clamped to at least 1 and
      * then scaled to neg_count * count / vnegs + 1 for that list, where
      * vnegs is vfscache_negs clamped to at least MINNEG.  That many
      * attempts are made; an attempt that fails to lock or zap an entry
      * still counts.
3806  */
3807 static void
3808 _cache_cleanneg(long count)
3809 {
3810         struct pcpu_ncache *pn;
3811         struct namecache *ncp;
3812         static uint32_t neg_rover;
3813         uint32_t n;
3814         long vnegs;
3815
3816         n = neg_rover++;        /* SMP heuristical, race ok */
3817         cpu_ccfence();
3818         n = n % (uint32_t)ncpus;
3819
3820         /*
3821          * Normalize vfscache_negs and count.  count is sometimes based
3822          * on vfscache_negs.  vfscache_negs is heuristical and can sometimes
3823          * have crazy values.
3824          */
3825         vnegs = vfscache_negs;
3826         cpu_ccfence();
3827         if (vnegs <= MINNEG)
3828                 vnegs = MINNEG;
3829         if (count < 1)
3830                 count = 1;
3831
3832         pn = &pcpu_ncache[n];
3833         spin_lock(&pn->neg_spin);
3834         count = pn->neg_count * count / vnegs + 1;
3835         spin_unlock(&pn->neg_spin);
3836
3837         /*
3838          * Attempt to clean out the specified number of negative cache
3839          * entries.
              *
              * The head entry is moved to the tail before we try to lock it,
              * so the next iteration looks at a different entry when the list
              * has more than one.
3840          */
3841         while (count > 0) {
3842                 spin_lock(&pn->neg_spin);
3843                 ncp = TAILQ_FIRST(&pn->neg_list);
3844                 if (ncp == NULL) {
3845                         spin_unlock(&pn->neg_spin);
3846                         break;
3847                 }
3848                 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
3849                 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
3850                 _cache_hold(ncp);
3851                 spin_unlock(&pn->neg_spin);
3852
3853                 /*
3854                  * This can race, so after successfully locking the ncp we
3855                  * must re-check that it is still a resolved negative entry
3856                  * (no vp and not UNRESOLVED) before zapping it.
                       *
                       * A non-NULL return from cache_zap() is dropped by us.
3856                  */
3857                 if (_cache_lock_special(ncp) == 0) {
3858                         if (ncp->nc_vp == NULL &&
3859                             (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3860                                 ncp = cache_zap(ncp, 1);
3861                                 if (ncp)
3862                                         _cache_drop(ncp);
3863                         } else {
3864                                 _cache_unlock(ncp);
3865                                 _cache_drop(ncp);
3866                         }
3867                 } else {
3868                         _cache_drop(ncp);
3869                 }
3870                 --count;
3871         }
3872 }
3873
3874 /*
3875  * Clean out positive cache entries when too many have accumulated.
      *
      * Makes (count) attempts.  Each attempt advances a shared rover to the
      * next hash chain, picks the first entry on it that is not DESTROYED,
      * and tries to lock and zap it.  An attempt on an empty chain or on an
      * entry that cannot be locked still counts.
3876  */
3877 static void
3878 _cache_cleanpos(long count)
3879 {
3880         static volatile int rover;
3881         struct nchash_head *nchpp;
3882         struct namecache *ncp;
3883         int rover_copy;
3884
3885         /*
3886          * Attempt to clean out the specified number of cache entries,
3887          * one candidate per hash chain visited.
3888          */
3889         while (count > 0) {
3890                 rover_copy = ++rover;   /* MPSAFEENOUGH */
3891                 cpu_ccfence();
3892                 nchpp = NCHHASH(rover_copy);
3893
                      /*
                       * Hold the candidate under the chain spinlock so it is
                       * still referenced once the spinlock is released.
                       */
3894                 spin_lock_shared(&nchpp->spin);
3895                 ncp = LIST_FIRST(&nchpp->list);
3896                 while (ncp && (ncp->nc_flag & NCF_DESTROYED))
3897                         ncp = LIST_NEXT(ncp, nc_hash);
3898                 if (ncp)
3899                         _cache_hold(ncp);
3900                 spin_unlock_shared(&nchpp->spin);
3901
3902                 if (ncp) {
                              /* a non-NULL return from cache_zap() is dropped by us */
3903                         if (_cache_lock_special(ncp) == 0) {
3904                                 ncp = cache_zap(ncp, 1);
3905                                 if (ncp)
3906                                         _cache_drop(ncp);
3907                         } else {
3908                                 _cache_drop(ncp);
3909                         }
3910                 }
3911                 --count;
3912         }
3913 }
3914
3915 /*
3916  * This is a kitchen sink function to clean out ncps which we
3917  * tried to zap from cache_drop() but failed because we were
3918  * unable to acquire the parent lock.
3919  *
3920  * Such entries can also be removed via cache_inval_vp(), such
3921  * as when unmounting.
3922  */
3923 static void
3924 _cache_cleandefered(void)
3925 {
3926         struct nchash_head *nchpp;
3927         struct namecache *ncp;
3928         struct namecache dummy;
3929         int i;
3930
3931         numdefered = 0;
3932         bzero(&dummy, sizeof(dummy));
3933         dummy.nc_flag = NCF_DESTROYED;
3934         dummy.nc_refs = 1;
3935
3936         for (i = 0; i <= nchash; ++i) {
3937                 nchpp = &nchashtbl[i];
3938
3939                 spin_lock(&nchpp->spin);
3940                 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
3941                 ncp = &dummy;
3942                 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
3943                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
3944                                 continue;
3945                         LIST_REMOVE(&dummy, nc_hash);
3946                         LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
3947                         _cache_hold(ncp);
3948                         spin_unlock(&nchpp->spin);
3949                         if (_cache_lock_nonblock(ncp) == 0) {
3950                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
3951                                 _cache_unlock(ncp);
3952                         }
3953                         _cache_drop(ncp);
3954                         spin_lock(&nchpp->spin);
3955                         ncp = &dummy;
3956                 }
3957                 LIST_REMOVE(&dummy, nc_hash);
3958                 spin_unlock(&nchpp->spin);
3959         }
3960 }
3961
3962 /*
3963  * Name cache initialization, from vfsinit() when we are booting
3964  */
3965 void
3966 nchinit(void)
3967 {
3968         struct pcpu_ncache *pn;
3969         globaldata_t gd;
3970         int i;
3971
3972         /*
3973          * Per-cpu accounting and negative hit list
3974          */
3975         pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
3976                               M_VFSCACHE, M_WAITOK|M_ZERO);
3977         for (i = 0; i < ncpus; ++i) {
3978                 pn = &pcpu_ncache[i];
3979                 TAILQ_INIT(&pn->neg_list);
3980                 spin_init(&pn->neg_spin, "ncneg");
3981         }
3982
3983         /*
3984          * Initialise per-cpu namecache effectiveness statistics.
3985          */
3986         for (i = 0; i < ncpus; ++i) {
3987                 gd = globaldata_find(i);
3988                 gd->gd_nchstats = &nchstats[i];
3989         }
3990
3991         /*
3992          * Create a generous namecache hash table
3993          */
3994         nchashtbl = hashinit_ext(vfs_inodehashsize(),
3995                                  sizeof(struct nchash_head),
3996                                  M_VFSCACHE, &nchash);
3997         for (i = 0; i <= (int)nchash; ++i) {
3998                 LIST_INIT(&nchashtbl[i].list);
3999                 spin_init(&nchashtbl[i].spin, "nchinit_hash");
4000         }
4001         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
4002                 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
4003         nclockwarn = 5 * hz;
4004 }
4005
4006 /*
4007  * Called from start_init() to bootstrap the root filesystem.  Returns
4008  * a referenced, unlocked namecache record.
4009  */
4010 void
4011 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
4012 {
4013         nch->ncp = cache_alloc(0);
4014         nch->mount = mp;
4015         _cache_mntref(mp);
4016         if (vp)
4017                 _cache_setvp(nch->mount, nch->ncp, vp);
4018 }
4019
4020 /*
4021  * vfs_cache_setroot()
4022  *
4023  *      Create an association between the root of our namecache and
4024  *      the root vnode.  This routine may be called several times during
4025  *      booting.
4026  *
4027  *      If the caller intends to save the returned namecache pointer somewhere
4028  *      it must cache_hold() it.
4029  */
4030 void
4031 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
4032 {
4033         struct vnode *ovp;
4034         struct nchandle onch;
4035
4036         ovp = rootvnode;
4037         onch = rootnch;
4038         rootvnode = nvp;
4039         if (nch)
4040                 rootnch = *nch;
4041         else
4042                 cache_zero(&rootnch);
4043         if (ovp)
4044                 vrele(ovp);
4045         if (onch.ncp)
4046                 cache_drop(&onch);
4047 }
4048
4049 /*
4050  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
4051  * topology and is being removed as quickly as possible.  The new VOP_N*()
4052  * API calls are required to make specific adjustments using the supplied
4053  * ncp pointers rather then just bogusly purging random vnodes.
4054  *
4055  * Invalidate all namecache entries to a particular vnode as well as
4056  * any direct children of that vnode in the namecache.  This is a
4057  * 'catch all' purge used by filesystems that do not know any better.
4058  *
4059  * Note that the linkage between the vnode and its namecache entries will
4060  * be removed, but the namecache entries themselves might stay put due to
4061  * active references from elsewhere in the system or due to the existance of
4062  * the children.   The namecache topology is left intact even if we do not
4063  * know what the vnode association is.  Such entries will be marked
4064  * NCF_UNRESOLVED.
4065  */
4066 void
4067 cache_purge(struct vnode *vp)
4068 {
4069         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
4070 }
4071
4072 static int disablecwd;
4073 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
4074     "Disable getcwd");
4075
4076 static u_long numcwdcalls;
4077 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
4078     "Number of current directory resolution calls");
4079 static u_long numcwdfailnf;
4080 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
4081     "Number of current directory failures due to lack of file");
4082 static u_long numcwdfailsz;
4083 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
4084     "Number of current directory failures due to large result");
4085 static u_long numcwdfound;
4086 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
4087     "Number of current directory resolution successes");
4088
4089 /*
4090  * MPALMOSTSAFE
4091  */
4092 int
4093 sys___getcwd(struct __getcwd_args *uap)
4094 {
4095         u_int buflen;
4096         int error;
4097         char *buf;
4098         char *bp;
4099
4100         if (disablecwd)
4101                 return (ENODEV);
4102
4103         buflen = uap->buflen;
4104         if (buflen == 0)
4105                 return (EINVAL);
4106         if (buflen > MAXPATHLEN)
4107                 buflen = MAXPATHLEN;
4108
4109         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
4110         bp = kern_getcwd(buf, buflen, &error);
4111         if (error == 0)
4112                 error = copyout(bp, uap->buf, strlen(bp) + 1);
4113         kfree(buf, M_TEMP);
4114         return (error);
4115 }
4116
4117 char *
4118 kern_getcwd(char *buf, size_t buflen, int *error)
4119 {
4120         struct proc *p = curproc;
4121         char *bp;
4122         int i, slash_prefixed;
4123         struct filedesc *fdp;
4124         struct nchandle nch;
4125         struct namecache *ncp;
4126
4127         numcwdcalls++;
4128         bp = buf;
4129         bp += buflen - 1;
4130         *bp = '\0';
4131         fdp = p->p_fd;
4132         slash_prefixed = 0;
4133
4134         nch = fdp->fd_ncdir;
4135         ncp = nch.ncp;
4136         if (ncp)
4137                 _cache_hold(ncp);
4138
4139         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
4140                nch.mount != fdp->fd_nrdir.mount)
4141         ) {
4142                 /*
4143                  * While traversing upwards if we encounter the root
4144                  * of the current mount we have to skip to the mount point
4145                  * in the underlying filesystem.
4146                  */
4147                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
4148                         nch = nch.mount->mnt_ncmounton;
4149                         _cache_drop(ncp);
4150                         ncp = nch.ncp;
4151                         if (ncp)
4152                                 _cache_hold(ncp);
4153                         continue;
4154                 }
4155
4156                 /*
4157                  * Prepend the path segment
4158                  */
4159                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4160                         if (bp == buf) {
4161                                 numcwdfailsz++;
4162                                 *error = ERANGE;
4163                                 bp = NULL;
4164                                 goto done;
4165                         }
4166                         *--bp = ncp->nc_name[i];
4167                 }
4168                 if (bp == buf) {
4169                         numcwdfailsz++;
4170                         *error = ERANGE;
4171                         bp = NULL;
4172                         goto done;
4173                 }
4174                 *--bp = '/';
4175                 slash_prefixed = 1;
4176
4177                 /*
4178                  * Go up a directory.  This isn't a mount point so we don't
4179                  * have to check again.
4180                  */
4181                 while ((nch.ncp = ncp->nc_parent) != NULL) {
4182                         if (ncp_shared_lock_disable)
4183                                 _cache_lock(ncp);
4184                         else
4185                                 _cache_lock_shared(ncp);
4186                         if (nch.ncp != ncp->nc_parent) {
4187                                 _cache_unlock(ncp);
4188                                 continue;
4189                         }
4190                         _cache_hold(nch.ncp);
4191                         _cache_unlock(ncp);
4192                         break;
4193                 }
4194                 _cache_drop(ncp);
4195                 ncp = nch.ncp;
4196         }
4197         if (ncp == NULL) {
4198                 numcwdfailnf++;
4199                 *error = ENOENT;
4200                 bp = NULL;
4201                 goto done;
4202         }
4203         if (!slash_prefixed) {
4204                 if (bp == buf) {
4205                         numcwdfailsz++;
4206                         *error = ERANGE;
4207                         bp = NULL;
4208                         goto done;
4209                 }
4210                 *--bp = '/';
4211         }
4212         numcwdfound++;
4213         *error = 0;
4214 done:
4215         if (ncp)
4216                 _cache_drop(ncp);
4217         return (bp);
4218 }
4219
4220 /*
4221  * Thus begins the fullpath magic.
4222  *
4223  * The passed nchp is referenced but not locked.
4224  */
4225 static int disablefullpath;
4226 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
4227     &disablefullpath, 0,
4228     "Disable fullpath lookups");
4229
4230 int
4231 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
4232                char **retbuf, char **freebuf, int guess)
4233 {
4234         struct nchandle fd_nrdir;
4235         struct nchandle nch;
4236         struct namecache *ncp;
4237         struct mount *mp, *new_mp;
4238         char *bp, *buf;
4239         int slash_prefixed;
4240         int error = 0;
4241         int i;
4242
4243         *retbuf = NULL;
4244         *freebuf = NULL;
4245
4246         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
4247         bp = buf + MAXPATHLEN - 1;
4248         *bp = '\0';
4249         if (nchbase)
4250                 fd_nrdir = *nchbase;
4251         else if (p != NULL)
4252                 fd_nrdir = p->p_fd->fd_nrdir;
4253         else
4254                 fd_nrdir = rootnch;
4255         slash_prefixed = 0;
4256         nch = *nchp;
4257         ncp = nch.ncp;
4258         if (ncp)
4259                 _cache_hold(ncp);
4260         mp = nch.mount;
4261
4262         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
4263                 new_mp = NULL;
4264
4265                 /*
4266                  * If we are asked to guess the upwards path, we do so whenever
4267                  * we encounter an ncp marked as a mountpoint. We try to find
4268                  * the actual mountpoint by finding the mountpoint with this
4269                  * ncp.
4270                  */
4271                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
4272                         new_mp = mount_get_by_nc(ncp);
4273                 }
4274                 /*
4275                  * While traversing upwards if we encounter the root
4276                  * of the current mount we have to skip to the mount point.
4277                  */
4278                 if (ncp == mp->mnt_ncmountpt.ncp) {
4279                         new_mp = mp;
4280                 }
4281                 if (new_mp) {
4282                         nch = new_mp->mnt_ncmounton;
4283                         _cache_drop(ncp);
4284                         ncp = nch.ncp;
4285                         if (ncp)
4286                                 _cache_hold(ncp);
4287                         mp = nch.mount;
4288                         continue;
4289                 }
4290
4291                 /*
4292                  * Prepend the path segment
4293                  */
4294                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4295                         if (bp == buf) {
4296                                 kfree(buf, M_TEMP);
4297                                 error = ENOMEM;
4298                                 goto done;
4299                         }
4300                         *--bp = ncp->nc_name[i];
4301                 }
4302                 if (bp == buf) {
4303                         kfree(buf, M_TEMP);
4304                         error = ENOMEM;
4305                         goto done;
4306                 }
4307                 *--bp = '/';
4308                 slash_prefixed = 1;
4309
4310                 /*
4311                  * Go up a directory.  This isn't a mount point so we don't
4312                  * have to check again.
4313                  *
4314                  * We can only safely access nc_parent with ncp held locked.
4315                  */
4316                 while ((nch.ncp = ncp->nc_parent) != NULL) {
4317                         _cache_lock(ncp);
4318                         if (nch.ncp != ncp->nc_parent) {
4319                                 _cache_unlock(ncp);
4320                                 continue;
4321                         }
4322                         _cache_hold(nch.ncp);
4323                         _cache_unlock(ncp);
4324                         break;
4325                 }
4326                 _cache_drop(ncp);
4327                 ncp = nch.ncp;
4328         }
4329         if (ncp == NULL) {
4330                 kfree(buf, M_TEMP);
4331                 error = ENOENT;
4332                 goto done;
4333         }
4334
4335         if (!slash_prefixed) {
4336                 if (bp == buf) {
4337                         kfree(buf, M_TEMP);
4338                         error = ENOMEM;
4339                         goto done;
4340                 }
4341                 *--bp = '/';
4342         }
4343         *retbuf = bp;
4344         *freebuf = buf;
4345         error = 0;
4346 done:
4347         if (ncp)
4348                 _cache_drop(ncp);
4349         return(error);
4350 }
4351
4352 int
4353 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
4354             char **freebuf, int guess)
4355 {
4356         struct namecache *ncp;
4357         struct nchandle nch;
4358         int error;
4359
4360         *freebuf = NULL;
4361         if (disablefullpath)
4362                 return (ENODEV);
4363
4364         if (p == NULL)
4365                 return (EINVAL);
4366
4367         /* vn is NULL, client wants us to use p->p_textvp */
4368         if (vn == NULL) {
4369                 if ((vn = p->p_textvp) == NULL)
4370                         return (EINVAL);
4371         }
4372         spin_lock_shared(&vn->v_spin);
4373         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
4374                 if (ncp->nc_nlen)
4375                         break;
4376         }
4377         if (ncp == NULL) {
4378                 spin_unlock_shared(&vn->v_spin);
4379                 return (EINVAL);
4380         }
4381         _cache_hold(ncp);
4382         spin_unlock_shared(&vn->v_spin);
4383
4384         nch.ncp = ncp;
4385         nch.mount = vn->v_mount;
4386         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
4387         _cache_drop(ncp);
4388         return (error);
4389 }
4390
4391 void
4392 vfscache_rollup_cpu(struct globaldata *gd)
4393 {
4394         struct pcpu_ncache *pn;
4395         long count;
4396
4397         if (pcpu_ncache == NULL)
4398                 return;
4399         pn = &pcpu_ncache[gd->gd_cpuid];
4400
4401         if (pn->vfscache_count) {
4402                 count = atomic_swap_long(&pn->vfscache_count, 0);
4403                 atomic_add_long(&vfscache_count, count);
4404         }
4405         if (pn->vfscache_leafs) {
4406                 count = atomic_swap_long(&pn->vfscache_leafs, 0);
4407                 atomic_add_long(&vfscache_leafs, count);
4408         }
4409         if (pn->vfscache_negs) {
4410                 count = atomic_swap_long(&pn->vfscache_negs, 0);
4411                 atomic_add_long(&vfscache_negs, count);
4412         }
4413 }
4414
#if 0
/*
 * Run vfscache_rollup_cpu() for every cpu, in ascending cpu order.
 * Currently compiled out.
 */
static void
vfscache_rollup_all(void)
{
        int cpu;

        for (cpu = 0; cpu < ncpus; ++cpu)
                vfscache_rollup_cpu(globaldata_find(cpu));
}
#endif