sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  */
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/kernel.h>
  68 #include <sys/sysctl.h>
  69 #include <sys/mount.h>
  70 #include <sys/vnode.h>
  71 #include <sys/malloc.h>
  72 #include <sys/sysproto.h>
  73 #include <sys/spinlock.h>
  74 #include <sys/proc.h>
  75 #include <sys/namei.h>
  76 #include <sys/nlookup.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/fnv_hash.h>
  79 #include <sys/globaldata.h>
  80 #include <sys/kern_syscall.h>
  81 #include <sys/dirent.h>
  82 #include <ddb/ddb.h>
  83
  84 #include <sys/sysref2.h>
  85 #include <sys/spinlock2.h>
  86 #include <sys/mplock2.h>
  87
  88 #define MAX_RECURSION_DEPTH     64      /* NOTE(review): consumer is not visible in this part of the file; presumably bounds recursive namecache operations -- confirm */
  89
  90 /*
  91  * Random lookups in the cache are accomplished with a hash table using
  92  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
  93  *
  94  * Negative entries may exist and correspond to resolved namecache
  95  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
  96  * will be set if the entry corresponds to a whited-out directory entry
  97  * (versus simply not finding the entry at all).   ncneglist is locked
  98  * with a global spinlock (ncspin).
  99  *
 100  * MPSAFE RULES:
 101  *
 102  * (1) A ncp must be referenced before it can be locked.
 103  *
 104  * (2) A ncp must be locked in order to modify it.
 105  *
 106  * (3) ncp locks are always ordered child -> parent.  That may seem
 107  *     backwards but forward scans use the hash table and thus can hold
 108  *     the parent unlocked when traversing downward.
 109  *
 110  *     This allows insert/rename/delete/dot-dot and other operations
 111  *     to use ncp->nc_parent links.
 112  *
 113  *     This also prevents a locked up e.g. NFS node from creating a
 114  *     chain reaction all the way back to the root vnode / namecache.
 115  *
 116  * (4) parent linkages require both the parent and child to be locked.
 117  */
 118
 119 /*
 120  * Structures associated with name caching.
      *
      * NCHHASH() converts a hash value into a pointer to its chain head by
      * masking the value with nchash and indexing nchashtbl[].
 121  */
 122 #define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
 123 #define MINNEG                  1024    /* NOTE(review): use not visible here; presumably a floor on negative entries */
 124 #define MINPOS                  1024    /* NOTE(review): use not visible here; presumably a floor on positive entries */
 125 #define NCMOUNT_NUMCACHE        1009    /* prime number; element count of ncmount_cache[] */
 126
 127 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 128
 129 LIST_HEAD(nchash_list, namecache);      /* list head type used for one hash chain */
 130
 131 struct nchash_head {
 132        struct nchash_list list;        /* namecache entries on this chain */
 133        struct spinlock  spin;          /* each hash chain has its own spin lock */
 134 };
 135
      /*
       * One slot of the mount point cache (see the debug.ncmount_cache_*
       * sysctls).  NOTE(review): the lookup/insert code is not visible in
       * this part of the file; field roles other than isneg are assumed
       * from their names.
       */
 136 struct ncmount_cache {
 137         struct spinlock spin;
 138         struct namecache *ncp;
 139         struct mount *mp;
 140         int isneg;              /* if != 0 mp is originator and not target */
 141 };
 142
 143 static struct nchash_head       *nchashtbl;     /* hash table, indexed through NCHHASH() */
 144 static struct namecache_list    ncneglist;      /* negative entries; locked by ncspin */
 145 static struct spinlock          ncspin;         /* global spinlock protecting ncneglist */
 146 static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];
 147
 148 /*
 149  * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 150  * to create the namecache infrastructure leading to a dangling vnode.
 151  *
 152  * 0    Only errors are reported
 153  * 1    Successes are reported
 154  * 2    Successes + the whole directory scan is reported
 155  * 3    Force the directory scan code to run as if the parent vnode did
 156  *      not have a namecache record, even if it does have one.
 157  */
 158 static int      ncvp_debug;
 159 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
 160     "Namecache debug level (0-3)");
 161
 162 static u_long   nchash;                 /* hash table size; applied as a bit mask by NCHHASH() */
 163 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 164     "Size of namecache hash table");
 165
 166 static int      ncnegflush = 10;        /* burst for negative flush */
 167 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
 168     "Batch flush negative entries");
 169
 170 static int      ncposflush = 10;        /* burst for positive flush */
 171 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
 172     "Batch flush positive entries");
 173
 174 static int      ncnegfactor = 16;       /* ratio of negative entries */
 175 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 176     "Ratio of namecache negative entries");
 177
 178 static int      nclockwarn;             /* tsleep() timeout, in ticks, after which a blocked ncp lock is reported */
 179 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
 180     "Warn on locked namecache entries in ticks");
 181
 182 static int      numdefered;             /* NOTE(review): presumably the number of entries with a deferred zap pending; the sysctl description below looks copy-pasted -- confirm */
 183 SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
 184     "Number of cache entries allocated");
 185
 186 static int      ncposlimit;             /* NOTE(review): presumably a limit on positive entries; the sysctl description below looks copy-pasted -- confirm */
 187 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
 188     "Number of cache entries allocated");
 189
 190 static int      ncp_shared_lock_disable = 0;    /* tunable; its consumer is not visible in this part of the file */
 191 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
 192            &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
 193
      /* Read-only constants exporting structure sizes. */
 194 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
 195     "sizeof(struct vnode)");
 196 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
 197     "sizeof(struct namecache)");
 198
      /* Mount point cache enable switch and its hit/miss/overwrite counters. */
 199 static int      ncmount_cache_enable = 1;
 200 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
 201            &ncmount_cache_enable, 0, "mount point cache");
 202 static long     ncmount_cache_hit;
 203 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_hit, CTLFLAG_RW,
 204             &ncmount_cache_hit, 0, "mpcache hits");
 205 static long     ncmount_cache_miss;
 206 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_miss, CTLFLAG_RW,
 207             &ncmount_cache_miss, 0, "mpcache misses");
 208 static long     ncmount_cache_overwrite;
 209 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW,
 210             &ncmount_cache_overwrite, 0, "mpcache entry overwrites");
 211
 212 static int cache_resolve_mp(struct mount *mp);        /* forward declarations of file-local helpers follow */
 213 static struct vnode *cache_dvpref(struct namecache *ncp);
 214 static void _cache_lock(struct namecache *ncp);       /* exclusive ncp lock; may sleep */
 215 static void _cache_setunresolved(struct namecache *ncp);
 216 static void _cache_cleanneg(int count);
 217 static void _cache_cleanpos(int count);
 218 static void _cache_cleandefered(void);
 219 static void _cache_unlink(struct namecache *ncp);
 220
 221 /*
 222  * The new name cache statistics
      *
      * Creates the vfs.cache sysctl node and exports the counters below
      * read-only (CTLFLAG_RD) underneath it.
 223  */
 224 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
 225 static int numneg;
 226 SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
 227     "Number of negative namecache entries");
 228 static int numcache;
 229 SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
 230     "Number of namecaches entries");
 231 static u_long numcalls;
 232 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0,
 233     "Number of namecache lookups");
 234 static u_long numchecks;
 235 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0,
 236     "Number of checked entries in namecache lookups");
 237
 238 struct nchstats nchstats[SMP_MAXCPU];   /* SMP_MAXCPU slots; NOTE(review): presumably one per cpu, wired to gd_nchstats elsewhere -- confirm */
 239 /*
 240  * Export VFS cache effectiveness statistics to user-land.
 241  *
 242  * The statistics are left for aggregation to user-land so
 243  * neat things can be achieved, like observing per-CPU cache
 244  * distribution.
 245  */
 246 static int
 247 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 248 {
 249         struct globaldata *gd;
 250         int i, error;
 251
 252         error = 0;
 253         for (i = 0; i < ncpus; ++i) {
 254                 gd = globaldata_find(i);
 255                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
 256                         sizeof(struct nchstats))))
 257                         break;
 258         }
 259
 260         return (error);
 261 }
 262 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
 263   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 264
 265 static struct namecache *cache_zap(struct namecache *ncp, int nonblock);
      /* Forward declaration; cache_zap() is defined elsewhere in this file. */
 266
 267 /*
 268  * Namespace locking.  The caller must already hold a reference to the
 269  * namecache structure in order to lock/unlock it.  This function prevents
 270  * the namespace from being created or destroyed by accessors other than
 271  * the lock holder.
 272  *
 273  * Note that holding a locked namecache structure prevents other threads
 274  * from making namespace changes (e.g. deleting or creating), prevents
 275  * vnode association state changes by other threads, and prevents the
 276  * namecache entry from being resolved or unresolved by other threads.
 277  *
 278  * An exclusive lock owner has full authority to associate/disassociate
 279  * vnodes and resolve/unresolve the locked ncp.
 280  *
 281  * A shared lock owner only has authority to acquire the underlying vnode,
 282  * if any.
 283  *
 284  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 285  * fact (when locking) or cleared prior to unlocking.
      *
      * _cache_lock() acquires the lock exclusively and sleeps until it can.
      * The owning thread may lock recursively; every acquisition adds one
      * to the count kept in nc_lockstatus.  While sleeping the function
      * sets NC_EXLOCK_REQ so that _cache_unlock() knows to wake it up.  If
      * a sleep times out (nclockwarn ticks) a one-time diagnostic is
      * printed, followed by a second message once the lock is obtained.
 286  *
 287  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 288  *           or recycled, but it does NOT help you if the vnode had already
 289  *           initiated a recyclement.  If this is important, use cache_get()
 290  *           rather than cache_lock() (and deal with the differences in the
 291  *           way the refs counter is handled).  Or, alternatively, make an
 292  *           unconditional call to cache_validate() or cache_resolve()
 293  *           after cache_lock() returns.
 294  */
 295 static
 296 void
 297 _cache_lock(struct namecache *ncp)
 298 {
 299         thread_t td;
 300         int didwarn;            /* ticks at the first warning, 0 if none was printed */
 301         int begticks;           /* ticks when we first had to sleep, 0 if we never slept */
 302         int error;
 303         u_int count;            /* snapshot of nc_lockstatus used for the cmpset */
 304
 305         KKASSERT(ncp->nc_refs != 0);
 306         didwarn = 0;
 307         begticks = 0;
 308         td = curthread;
 309
 310         for (;;) {
 311                 count = ncp->nc_lockstatus;
 312                 cpu_ccfence();
 313
                     /*
                      * Lock is free: nothing but (possibly) request bits
                      * are set.  Try to take the first count.
                      */
 314                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
 315                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 316                                               count, count + 1)) {
 317                                 /*
 318                                  * The vp associated with a locked ncp must
 319                                  * be held to prevent it from being recycled.
 320                                  *
 321                                  * WARNING!  If VRECLAIMED is set the vnode
 322                                  * could already be in the middle of a recycle.
 323                                  * Callers must use cache_vref() or
 324                                  * cache_vget() on the locked ncp to
 325                                  * validate the vp or set the cache entry
 326                                  * to unresolved.
 327                                  *
 328                                  * NOTE! vhold() is allowed if we hold a
 329                                  *       lock on the ncp (which we do).
 330                                  */
 331                                 ncp->nc_locktd = td;
 332                                 if (ncp->nc_vp)
 333                                         vhold(ncp->nc_vp);
 334                                 break;
 335                         }
 336                         /* cmpset failed, nc_lockstatus changed; re-sample */
 337                         continue;
 338                 }
                     /*
                      * Recursive acquisition by the exclusive owner: just
                      * bump the count.  The vnode was already held by the
                      * first acquisition, so no vhold() here.
                      */
 339                 if (ncp->nc_locktd == td) {
 340                         KKASSERT((count & NC_SHLOCK_FLAG) == 0);
 341                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 342                                               count, count + 1)) {
 343                                 break;
 344                         }
 345                         /* cmpset failed, nc_lockstatus changed; re-sample */
 346                         continue;
 347                 }
                     /*
                      * Held by somebody else.  Arm the sleep on
                      * &nc_locktd first, then publish NC_EXLOCK_REQ; the
                      * PINTERLOCKED tsleep() below completes the pair
                      * (see tsleep(9)).  _cache_unlock() issues
                      * wakeup(&nc_locktd) when it sees NC_EXLOCK_REQ.
                      */
 348                 tsleep_interlock(&ncp->nc_locktd, 0);
 349                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
 350                                       count | NC_EXLOCK_REQ) == 0) {
 351                         /* cmpset failed, nc_lockstatus changed; re-sample */
 352                         continue;
 353                 }
 354                 if (begticks == 0)
 355                         begticks = ticks;
 356                 error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
 357                                "clock", nclockwarn);
 358                 if (error == EWOULDBLOCK) {
                             /* sleep timed out: warn once per call */
 359                         if (didwarn == 0) {
 360                                 didwarn = ticks;
 361                                 kprintf("[diagnostic] cache_lock: "
 362                                         "blocked on %p %08x",
 363                                         ncp, count);
 364                                 kprintf(" \"%*.*s\"\n",
 365                                         ncp->nc_nlen, ncp->nc_nlen,
 366                                         ncp->nc_name);
 367                         }
 368                 }
 369                 /* woken up or timed out; retry from the top */
 370         }
         /*
          * If we warned, also report how long we were blocked, measured
          * from the first sleep and rounded to the nearest second.
          */
 371         if (didwarn) {
 372                 kprintf("[diagnostic] cache_lock: unblocked %*.*s after "
 373                         "%d secs\n",
 374                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 375                         (int)(ticks + (hz / 2) - begticks) / hz);
 376         }
 377 }
 378
 379 /*
 380  * The shared lock works similarly to the exclusive lock except
 381  * nc_locktd is left NULL and we need an interlock (VHOLD) to
 382  * prevent vhold() races, since the moment our cmpset_int succeeds
 383  * another cpu can come in and get its own shared lock.
 384  *
 385  * A critical section is needed to prevent interruption during the
 386  * VHOLD interlock.
      *
      * This is the blocking variant.  While waiting it sets NC_SHLOCK_REQ
      * and sleeps on the ncp itself (exclusive waiters sleep on
      * &nc_locktd instead).  After the first sleep timeout (nclockwarn
      * ticks) it stops deferring to pending exclusive requests when
      * piggy-backing on an existing shared lock.
 387  */
 388 static
 389 void
 390 _cache_lock_shared(struct namecache *ncp)
 391 {
 392         int didwarn;            /* ticks at the first warning, 0 if none was printed */
 393         int error;
 394         u_int count;            /* snapshot of nc_lockstatus used for the cmpset */
 395         u_int optreq = NC_EXLOCK_REQ;   /* request bits we defer to; zeroed after a timeout */
 396
 397         KKASSERT(ncp->nc_refs != 0);
 398         didwarn = 0;
 399
 400         for (;;) {
 401                 count = ncp->nc_lockstatus;
 402                 cpu_ccfence();
 403
                     /*
                      * Lock is free and no exclusive request is pending
                      * (NC_EXLOCK_REQ is deliberately not masked out).
                      * Take the first shared count and set VHOLD in the
                      * same atomic op so later shared lockers wait until
                      * the vnode has actually been held.
                      */
 404                 if ((count & ~NC_SHLOCK_REQ) == 0) {
 405                         crit_enter();
 406                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 407                                       count,
 408                                       (count + 1) | NC_SHLOCK_FLAG |
 409                                                     NC_SHLOCK_VHOLD)) {
 410                                 /*
 411                                  * The vp associated with a locked ncp must
 412                                  * be held to prevent it from being recycled.
 413                                  *
 414                                  * WARNING!  If VRECLAIMED is set the vnode
 415                                  * could already be in the middle of a recycle.
 416                                  * Callers must use cache_vref() or
 417                                  * cache_vget() on the locked ncp to
 418                                  * validate the vp or set the cache entry
 419                                  * to unresolved.
 420                                  *
 421                                  * NOTE! vhold() is allowed if we hold a
 422                                  *       lock on the ncp (which we do).
 423                                  */
 424                                 if (ncp->nc_vp)
 425                                         vhold(ncp->nc_vp);
 426                                 atomic_clear_int(&ncp->nc_lockstatus,
 427                                                  NC_SHLOCK_VHOLD);
 428                                 crit_exit();
 429                                 break;
 430                         }
 431                         /* cmpset failed, nc_lockstatus changed; re-sample */
 432                         crit_exit();
 433                         continue;
 434                 }
 435
 436                 /*
 437                  * If already held shared we can just bump the count, but
 438                  * only allow this if nobody is trying to get the lock
 439                  * exclusively.  If we are blocking too long ignore excl
 440                  * requests (which can race/deadlock us).
 441                  *
 442                  * VHOLD is a bit of a hack.  Even though we successfully
 443                  * added another shared ref, the cpu that got the first
 444                  * shared ref might not yet have held the vnode.
 445                  */
 446                 if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) {
 447                         KKASSERT((count & ~(NC_EXLOCK_REQ |
 448                                             NC_SHLOCK_REQ |
 449                                             NC_SHLOCK_FLAG)) > 0);
 450                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 451                                               count, count + 1)) {
                                     /* spin until the first locker finished its vhold() */
 452                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
 453                                         cpu_pause();
 454                                 break;
 455                         }
 456                         continue;
 457                 }
                     /*
                      * Cannot get it now.  Arm the sleep on the ncp,
                      * publish NC_SHLOCK_REQ, then sleep interlocked.
                      * _cache_unlock() issues wakeup(ncp) for shared
                      * waiters.
                      */
 458                 tsleep_interlock(ncp, 0);
 459                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
 460                                       count | NC_SHLOCK_REQ) == 0) {
 461                         /* cmpset failed, nc_lockstatus changed; re-sample */
 462                         continue;
 463                 }
 464                 error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
 465                 if (error == EWOULDBLOCK) {
                             /* timed out: stop deferring to exclusive requests */
 466                         optreq = 0;
 467                         if (didwarn == 0) {
 468                                 didwarn = ticks;
 469                                 kprintf("[diagnostic] cache_lock_shared: "
 470                                         "blocked on %p %08x",
 471                                         ncp, count);
 472                                 kprintf(" \"%*.*s\"\n",
 473                                         ncp->nc_nlen, ncp->nc_nlen,
 474                                         ncp->nc_name);
 475                         }
 476                 }
 477                 /* woken up or timed out; retry from the top */
 478         }
         /*
          * NOTE(review): unlike _cache_lock(), the elapsed time printed
          * here is measured from the first warning (didwarn) rather than
          * from the first sleep, and is truncated instead of rounded.
          */
 479         if (didwarn) {
 480                 kprintf("[diagnostic] cache_lock_shared: "
 481                         "unblocked %*.*s after %d secs\n",
 482                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 483                         (int)(ticks - didwarn) / hz);
 484         }
 485 }
 486
 487 /*
      * Try to acquire the exclusive lock without sleeping.
      *
      * Returns 0 if the lock was obtained (either a first acquisition, in
      * which case nc_locktd is set and the vnode, if any, is vhold()'d, or
      * a recursive acquisition by the current owner).  Returns EWOULDBLOCK
      * if the lock is held by anyone else; nc_lockstatus is left untouched
      * in that case (no request bit is set).
      *
 488  * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
 489  *       such as the case where one of its children is locked.
 490  */
 491 static
 492 int
 493 _cache_lock_nonblock(struct namecache *ncp)
 494 {
 495         thread_t td;
 496         u_int count;            /* snapshot of nc_lockstatus used for the cmpset */
 497
 498         td = curthread;
 499
 500         for (;;) {
 501                 count = ncp->nc_lockstatus;
 502
                     /* lock is free: only request bits (if any) are set */
 503                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
 504                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 505                                               count, count + 1)) {
 506                                 /*
 507                                  * The vp associated with a locked ncp must
 508                                  * be held to prevent it from being recycled.
 509                                  *
 510                                  * WARNING!  If VRECLAIMED is set the vnode
 511                                  * could already be in the middle of a recycle.
 512                                  * Callers must use cache_vref() or
 513                                  * cache_vget() on the locked ncp to
 514                                  * validate the vp or set the cache entry
 515                                  * to unresolved.
 516                                  *
 517                                  * NOTE! vhold() is allowed if we hold a
 518                                  *       lock on the ncp (which we do).
 519                                  */
 520                                 ncp->nc_locktd = td;
 521                                 if (ncp->nc_vp)
 522                                         vhold(ncp->nc_vp);
 523                                 break;
 524                         }
 525                         /* cmpset failed, nc_lockstatus changed; re-sample */
 526                         continue;
 527                 }
                     /* recursive acquisition by the exclusive owner */
 528                 if (ncp->nc_locktd == td) {
 529                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 530                                               count, count + 1)) {
 531                                 break;
 532                         }
 533                         /* cmpset failed, nc_lockstatus changed; re-sample */
 534                         continue;
 535                 }
                     /* held by someone else; do not wait */
 536                 return(EWOULDBLOCK);
 537         }
 538         return(0);
 539 }
 540
 541 /*
      * Try to acquire a shared lock without sleeping.
      *
      * Returns 0 on success, or EWOULDBLOCK if the lock is held
      * exclusively or an exclusive request is pending.  No request bit is
      * set on failure.
      *
 542  * The shared lock works similarly to the exclusive lock except
 543  * nc_locktd is left NULL and we need an interlock (VHOLD) to
 544  * prevent vhold() races, since the moment our cmpset_int succeeds
 545  * another cpu can come in and get its own shared lock.
 546  *
 547  * A critical section is needed to prevent interruption during the
 548  * VHOLD interlock.
 549  */
 550 static
 551 int
 552 _cache_lock_shared_nonblock(struct namecache *ncp)
 553 {
 554         u_int count;            /* snapshot of nc_lockstatus used for the cmpset */
 555
 556         for (;;) {
 557                 count = ncp->nc_lockstatus;
 558
                     /*
                      * Lock is free and no exclusive request is pending:
                      * take the first shared count with VHOLD set.
                      */
 559                 if ((count & ~NC_SHLOCK_REQ) == 0) {
 560                         crit_enter();
 561                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 562                                       count,
 563                                       (count + 1) | NC_SHLOCK_FLAG |
 564                                                     NC_SHLOCK_VHOLD)) {
 565                                 /*
 566                                  * The vp associated with a locked ncp must
 567                                  * be held to prevent it from being recycled.
 568                                  *
 569                                  * WARNING!  If VRECLAIMED is set the vnode
 570                                  * could already be in the middle of a recycle.
 571                                  * Callers must use cache_vref() or
 572                                  * cache_vget() on the locked ncp to
 573                                  * validate the vp or set the cache entry
 574                                  * to unresolved.
 575                                  *
 576                                  * NOTE! vhold() is allowed if we hold a
 577                                  *       lock on the ncp (which we do).
 578                                  */
 579                                 if (ncp->nc_vp)
 580                                         vhold(ncp->nc_vp);
 581                                 atomic_clear_int(&ncp->nc_lockstatus,
 582                                                  NC_SHLOCK_VHOLD);
 583                                 crit_exit();
 584                                 break;
 585                         }
 586                         /* cmpset failed, nc_lockstatus changed; re-sample */
 587                         crit_exit();
 588                         continue;
 589                 }
 590
 591                 /*
 592                  * If already held shared we can just bump the count, but
 593                  * only allow this if nobody is trying to get the lock
 594                  * exclusively.
 595                  *
 596                  * VHOLD is a bit of a hack.  Even though we successfully
 597                  * added another shared ref, the cpu that got the first
 598                  * shared ref might not yet have held the vnode.
 599                  */
 600                 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
 601                     NC_SHLOCK_FLAG) {
 602                         KKASSERT((count & ~(NC_EXLOCK_REQ |
 603                                             NC_SHLOCK_REQ |
 604                                             NC_SHLOCK_FLAG)) > 0);
 605                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 606                                               count, count + 1)) {
                                     /* spin until the first locker finished its vhold() */
 607                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
 608                                         cpu_pause();
 609                                 break;
 610                         }
 611                         continue;
 612                 }
                     /* exclusively held or exclusive request pending; do not wait */
 613                 return(EWOULDBLOCK);
 614         }
 615         return(0);
 616 }
 617
 618 /*
 619  * Helper function
      *
      * Release one count of the lock on ncp, exclusive or shared.  When
      * the last count goes away the lock word is cleared, waiters are
      * woken up (exclusive waiters in preference to shared ones) and the
      * vnode hold taken at lock time is dropped.
 620  *
 621  * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
 622  *
 623  *       nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
 624  */
 625 static
 626 void
 627 _cache_unlock(struct namecache *ncp)
 628 {
 629         thread_t td __debugvar = curthread;
 630         u_int count;            /* snapshot of nc_lockstatus */
 631         u_int ncount;           /* new lock word for the final release */
 632         struct vnode *dropvp;   /* vnode to vdrop() once the lock is released */
 633
 634         KKASSERT(ncp->nc_refs >= 0);
 635         KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
 636         KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);
 637
 638         count = ncp->nc_lockstatus;
 639         cpu_ccfence();
 640
 641         /*
 642          * Clear nc_locktd prior to the atomic op (excl lock only)
 643          */
 644         if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
 645                 ncp->nc_locktd = NULL;
 646         dropvp = NULL;
 647
 648         for (;;) {
                     /*
                      * Final release (count is 1 once the request bits
                      * and the shared flag are masked off).
                      */
 649                 if ((count &
 650                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
 651                         dropvp = ncp->nc_vp;
                             /*
                              * With an exclusive request pending keep only
                              * NC_SHLOCK_REQ in the new word so shared
                              * waiters stay registered; otherwise clear
                              * the word completely.
                              */
 652                         if (count & NC_EXLOCK_REQ)
 653                                 ncount = count & NC_SHLOCK_REQ; /* cnt->0 */
 654                         else
 655                                 ncount = 0;
 656
 657                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 658                                               count, ncount)) {
                                     /* wake exclusive waiters first, else shared waiters */
 659                                 if (count & NC_EXLOCK_REQ)
 660                                         wakeup(&ncp->nc_locktd);
 661                                 else if (count & NC_SHLOCK_REQ)
 662                                         wakeup(ncp);
 663                                 break;
 664                         }
                             /* cmpset failed; forget the vp and re-evaluate below */
 665                         dropvp = NULL;
 666                 } else {
                             /* other counts remain: just decrement */
 667                         KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
 668                         KKASSERT((count & ~(NC_EXLOCK_REQ |
 669                                             NC_SHLOCK_REQ |
 670                                             NC_SHLOCK_FLAG)) > 1);
 671                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 672                                               count, count - 1)) {
 673                                 break;
 674                         }
 675                 }
                     /* cmpset failed; re-sample the lock word and retry */
 676                 count = ncp->nc_lockstatus;
 677                 cpu_ccfence();
 678         }
 679
 680         /*
 681          * Don't actually drop the vp until we successfully clean out
 682          * the lock, otherwise we may race another shared lock.
 683          */
 684         if (dropvp)
 685                 vdrop(dropvp);
 686 }
 687
 688 static
 689 int
 690 _cache_lockstatus(struct namecache *ncp)
 691 {
 692         if (ncp->nc_locktd == curthread)
 693                 return(LK_EXCLUSIVE);
 694         if (ncp->nc_lockstatus & NC_SHLOCK_FLAG)
 695                 return(LK_SHARED);
 696         return(-1);
 697 }
 698
 699 /*
 700  * cache_hold() and cache_drop() prevent the premature deletion of a
 701  * namecache entry but do not prevent operations (such as zapping) on
 702  * that namecache entry.
 703  *
 704  * This routine may only be called from outside this source module if
 705  * nc_refs is already at least 1.
 706  *
 707  * This is a rare case where callers are allowed to hold a spinlock,
 708  * so we can't ourselves.
      *
      * The reference is added with a single atomic increment of nc_refs and
      * the ncp that was passed in is returned unchanged.
 709  */
 710 static __inline
 711 struct namecache *
 712 _cache_hold(struct namecache *ncp)
 713 {
 714         atomic_add_int(&ncp->nc_refs, 1);
 715         return(ncp);
 716 }
 717
 718 /*
 719  * Drop a cache entry, taking care to deal with races.
 720  *
 721  * For potential 1->0 transitions we must hold the ncp lock to safely
 722  * test its flags.  An unresolved entry with no children must be zapped
 723  * to avoid leaks.
 724  *
 725  * The call to cache_zap() itself will handle all remaining races and
 726  * will decrement the ncp's refs regardless.  If we are resolved or
 727  * have children nc_refs can safely be dropped to 0 without having to
 728  * zap the entry.
 729  *
 730  * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 731  *
 732  * NOTE: cache_zap() may return a non-NULL referenced parent which must
 733  *       be dropped in a loop.
      *
      * NOTE: A NULL ncp is accepted and is a no-op (the loop never runs).
 734  */
 735 static __inline
 736 void
 737 _cache_drop(struct namecache *ncp)
 738 {
 739         int refs;
 740
 741         while (ncp) {
 742                 KKASSERT(ncp->nc_refs > 0);
 743                 refs = ncp->nc_refs;
 744
 745                 if (refs == 1) {
                             /*
                              * Possible last reference.  Only proceed if the
                              * lock can be had without blocking; on failure
                              * fall through to cpu_pause() and retry.
                              */
 746                         if (_cache_lock_nonblock(ncp) == 0) {
 747                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
 748                                 if ((ncp->nc_flag & NCF_UNRESOLVED) &&
 749                                     TAILQ_EMPTY(&ncp->nc_list)) {
                                             /*
                                              * Unresolved leaf: zap it and
                                              * continue the loop with whatever
                                              * cache_zap() hands back (NULL
                                              * terminates the loop).
                                              */
 750                                         ncp = cache_zap(ncp, 1);
 751                                         continue;
 752                                 }
                                     /*
                                      * Resolved or has children: just try the
                                      * 1->0 transition.  If another thread
                                      * changed nc_refs in the meantime, unlock
                                      * and retry from the top.
                                      */
 753                                 if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
 754                                         _cache_unlock(ncp);
 755                                         break;
 756                                 }
 757                                 _cache_unlock(ncp);
 758                         }
 759                 } else {
                             /*
                              * Not the last reference, a lockless decrement
                              * is sufficient.  Retry if the cmpset loses a
                              * race.
                              */
 760                         if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
 761                                 break;
 762                 }
 763                 cpu_pause();
 764         }
 765 }
 766
 767 /*
 768  * Link a new namecache entry to its parent and to the hash table.  Be
 769  * careful to avoid races if vhold() blocks in the future.
 770  *
 771  * Both ncp and par must be referenced and locked.
 772  *
 773  * NOTE: The hash table spinlock is held during this call, we can't do
 774  *       anything fancy.
      *
      * ncp must not already have a parent (asserted).  On return ncp records
      * par in nc_parent and nchpp in nc_head, and sits at the head of both
      * the hash chain and the parent's child list.
 775  */
 776 static void
 777 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 778                    struct nchash_head *nchpp)
 779 {
 780         KKASSERT(ncp->nc_parent == NULL);
 781         ncp->nc_parent = par;
 782         ncp->nc_head = nchpp;
 783
 784         /*
 785          * Set inheritance flags.  Note that the parent flags may be
 786          * stale due to getattr potentially not having been run yet
 787          * (it gets run during nlookup()'s).
 788          */
 789         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 790         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
 791                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 792         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
 793                 ncp->nc_flag |= NCF_UF_PCACHE;
 794
 795         LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 796
             /*
              * The list insertion is the same in both branches below; the
              * only difference is that the parent's first child also takes
              * a hold on the parent's vnode.
              */
 797         if (TAILQ_EMPTY(&par->nc_list)) {
 798                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 799                 /*
 800                  * Any vp associated with an ncp which has children must
 801                  * be held to prevent it from being recycled.
 802                  */
 803                 if (par->nc_vp)
 804                         vhold(par->nc_vp);
 805         } else {
 806                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 807         }
 808 }
 809
 810 /*
 811  * Remove the parent and hash associations from a namecache structure.
 812  * If this is the last child of the parent the cache_drop(par) will
 813  * attempt to recursively zap the parent.
 814  *
 815  * ncp must be locked.  This routine will acquire a temporary lock on
 816  * the parent as well as the appropriate hash chain.
      *
      * Nothing is done if ncp has no parent.
 817  */
 818 static void
 819 _cache_unlink_parent(struct namecache *ncp)
 820 {
 821         struct namecache *par;
 822         struct vnode *dropvp;
 823
 824         if ((par = ncp->nc_parent) != NULL) {
 825                 KKASSERT(ncp->nc_parent == par);
                     /*
                      * Take our own ref on the parent before locking it; the
                      * ref is released by the _cache_drop() below.
                      */
 826                 _cache_hold(par);
 827                 _cache_lock(par);
 828                 spin_lock(&ncp->nc_head->spin);
 829                 LIST_REMOVE(ncp, nc_hash);
 830                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
                     /*
                      * If that was the parent's last child, remember the
                      * parent's vnode so it can be vdrop()'d once the
                      * spinlock has been released.
                      */
 831                 dropvp = NULL;
 832                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
 833                         dropvp = par->nc_vp;
 834                 spin_unlock(&ncp->nc_head->spin);
 835                 ncp->nc_parent = NULL;
 836                 ncp->nc_head = NULL;
 837                 _cache_unlock(par);
 838                 _cache_drop(par);
 839
 840                 /*
 841                  * We can only safely vdrop with no spinlocks held.
 842                  */
 843                 if (dropvp)
 844                         vdrop(dropvp);
 845         }
 846 }
 847
 848 /*
 849  * Allocate a new namecache structure.  Most of the code does not require
 850  * zero-termination of the string but it makes vop_compat_ncreate() easier.
      *
      * The structure is allocated zeroed.  When nlen is non-zero a name
      * buffer of nlen + 1 bytes is allocated as well; its contents are not
      * filled in or terminated here, that is left to the caller.
      *
      * The entry is returned with one reference, flagged NCF_UNRESOLVED
      * with nc_error set to ENOTCONN, and already locked via _cache_lock().
 851  */
 852 static struct namecache *
 853 cache_alloc(int nlen)
 854 {
 855         struct namecache *ncp;
 856
 857         ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
 858         if (nlen)
 859                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
 860         ncp->nc_nlen = nlen;
 861         ncp->nc_flag = NCF_UNRESOLVED;
 862         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
 863         ncp->nc_refs = 1;
 864
 865         TAILQ_INIT(&ncp->nc_list);
 866         _cache_lock(ncp);
 867         return(ncp);
 868 }
 869
 870 /*
 871  * Can only be called for the case where the ncp has never been
 872  * associated with anything (so no spinlocks are needed).
      *
      * The ncp must still be in the state cache_alloc() leaves it in as far
      * as counts go: exactly one ref and a lock status of exactly 1 (both
      * asserted).  The name buffer, if any, and the structure itself are
      * freed; no unlock or drop is performed first.
 873  */
 874 static void
 875 _cache_free(struct namecache *ncp)
 876 {
 877         KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1);
 878         if (ncp->nc_name)
 879                 kfree(ncp->nc_name, M_VFSCACHE);
 880         kfree(ncp, M_VFSCACHE);
 881 }
 882
 883 /*
 884  * [re]initialize a nchandle.
      *
      * Both fields are simply overwritten with NULL.  No reference held
      * through the previous contents is released here.
 885  */
 886 void
 887 cache_zero(struct nchandle *nch)
 888 {
 889         nch->ncp = NULL;
 890         nch->mount = NULL;
 891 }
 892
 893 /*
 894  * Ref and deref a namecache structure.
 895  *
 896  * The caller must specify a stable ncp pointer, typically meaning the
 897  * ncp is already referenced but this can also occur indirectly through
 898  * e.g. holding a lock on a direct child.
 899  *
 900  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 901  *          use read spinlocks here.
 902  *
 903  * MPSAFE if nch is
      *
      * Adds one reference to nch->ncp and one to nch->mount (mnt_refs), then
      * returns nch itself.  Both fields must be non-NULL.
 904  */
 905 struct nchandle *
 906 cache_hold(struct nchandle *nch)
 907 {
 908         _cache_hold(nch->ncp);
 909         atomic_add_int(&nch->mount->mnt_refs, 1);
 910         return(nch);
 911 }
 912
 913 /*
 914  * Create a copy of a namecache handle for an already-referenced
 915  * entry.
 916  *
 917  * MPSAFE if nch is
      *
      * target receives a structure copy of nch.  An extra ncp reference is
      * taken only when the ncp is non-NULL, but the mount reference is taken
      * unconditionally.
      *
      * NOTE(review): nch->mount is dereferenced even when nch->ncp is NULL,
      *               so callers must always supply a valid mount - confirm
      *               against callers.
 918  */
 919 void
 920 cache_copy(struct nchandle *nch, struct nchandle *target)
 921 {
 922         *target = *nch;
 923         if (target->ncp)
 924                 _cache_hold(target->ncp);
 925         atomic_add_int(&nch->mount->mnt_refs, 1);
 926 }
 927
 928 /*
 929  * MPSAFE if nch is
      *
      * Retarget nch at a different mount: the reference on the old mount is
      * released, mp is installed, and a reference is taken on mp.  The ncp
      * field is not touched.
      *
      * NOTE: The old reference is released before the new one is taken, so
      *       if mp is the mount already in nch the count dips momentarily.
 930  */
 931 void
 932 cache_changemount(struct nchandle *nch, struct mount *mp)
 933 {
 934         atomic_add_int(&nch->mount->mnt_refs, -1);
 935         nch->mount = mp;
 936         atomic_add_int(&nch->mount->mnt_refs, 1);
 937 }
 938
     /*
      * Release the mount reference and the ncp reference carried by nch,
      * then clear both fields of the handle so it cannot be reused by
      * accident.  nch->mount must be non-NULL; a NULL nch->ncp is tolerated
      * by _cache_drop().
      */
 939 void
 940 cache_drop(struct nchandle *nch)
 941 {
 942         atomic_add_int(&nch->mount->mnt_refs, -1);
 943         _cache_drop(nch->ncp);
 944         nch->ncp = NULL;
 945         nch->mount = NULL;
 946 }
 947
     /*
      * nchandle wrapper around _cache_lockstatus().  Returns LK_EXCLUSIVE,
      * LK_SHARED or -1 for nch->ncp.
      */
 948 int
 949 cache_lockstatus(struct nchandle *nch)
 950 {
 951         return(_cache_lockstatus(nch->ncp));
 952 }
 953
     /*
      * nchandle wrapper around _cache_lock(): lock nch->ncp, blocking if
      * necessary.
      */
 954 void
 955 cache_lock(struct nchandle *nch)
 956 {
 957         _cache_lock(nch->ncp);
 958 }
 959
     /*
      * Lock nch->ncp, using a shared lock when that is permissible.
      *
      * _cache_lock() is used outright when shared locks are disabled
      * (ncp_shared_lock_disable), when the caller passes a non-zero excl,
      * or when the ncp is unresolved.  Otherwise a shared lock is taken and
      * the ncp is re-examined under it; if it turned out to be unresolved,
      * or its vnode is flagged VRECLAIMED, the shared lock is released and
      * replaced with _cache_lock().
      *
      * NOTE: The ncp is briefly unlocked while swapping the shared lock for
      *       the _cache_lock() one.
      */
 960 void
 961 cache_lock_maybe_shared(struct nchandle *nch, int excl)
 962 {
 963         struct namecache *ncp = nch->ncp;
 964
 965         if (ncp_shared_lock_disable || excl ||
 966             (ncp->nc_flag & NCF_UNRESOLVED)) {
 967                 _cache_lock(ncp);
 968         } else {
 969                 _cache_lock_shared(ncp);
 970                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 971                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
 972                                 _cache_unlock(ncp);
 973                                 _cache_lock(ncp);
 974                         }
 975                 } else {
 976                         _cache_unlock(ncp);
 977                         _cache_lock(ncp);
 978                 }
 979         }
 980 }
 981
 982 /*
 983  * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 984  * is responsible for checking both for validity on return as they
 985  * may have become invalid.
 986  *
 987  * We have to deal with potential deadlocks here, just ping pong
 988  * the lock until we get it (we will always block somewhere when
 989  * looping so this is not cpu-intensive).
 990  *
 991  * which = 0    nch1 not locked, nch2 is locked
 992  * which = 1    nch1 is locked, nch2 is not locked
      *
      * Each handle is passed to cache_resolve() (with its matching cred)
      * immediately after its lock is obtained; the result of that call is
      * not examined here.  The loop only exits once a non-blocking attempt
      * on the missing lock succeeds, i.e. with both handles locked.
 993  */
 994 void
 995 cache_relock(struct nchandle *nch1, struct ucred *cred1,
 996              struct nchandle *nch2, struct ucred *cred2)
 997 {
 998         int which;
 999
1000         which = 0;
1001
1002         for (;;) {
1003                 if (which == 0) {
                             /* holding nch2, need nch1 */
1004                         if (cache_lock_nonblock(nch1) == 0) {
1005                                 cache_resolve(nch1, cred1);
1006                                 break;
1007                         }
                             /*
                              * Would have blocked.  Give up nch2 so we can
                              * block on nch1, then flip roles.
                              */
1008                         cache_unlock(nch2);
1009                         cache_lock(nch1);
1010                         cache_resolve(nch1, cred1);
1011                         which = 1;
1012                 } else {
                             /* holding nch1, need nch2 */
1013                         if (cache_lock_nonblock(nch2) == 0) {
1014                                 cache_resolve(nch2, cred2);
1015                                 break;
1016                         }
                             /*
                              * Would have blocked.  Give up nch1 so we can
                              * block on nch2, then flip roles.
                              */
1017                         cache_unlock(nch1);
1018                         cache_lock(nch2);
1019                         cache_resolve(nch2, cred2);
1020                         which = 0;
1021                 }
1022         }
1023 }
1024
     /*
      * nchandle wrapper around _cache_lock_nonblock().  Callers in this
      * file treat a return value of 0 as "lock acquired".
      */
1025 int
1026 cache_lock_nonblock(struct nchandle *nch)
1027 {
1028         return(_cache_lock_nonblock(nch->ncp));
1029 }
1030
     /*
      * nchandle wrapper around _cache_unlock(): release the lock held on
      * nch->ncp.  The reference count is not changed.
      */
1031 void
1032 cache_unlock(struct nchandle *nch)
1033 {
1034         _cache_unlock(nch->ncp);
1035 }
1036
1037 /*
1038  * ref-and-lock, unlock-and-deref functions.
1039  *
1040  * This function is primarily used by nlookup.  Even though cache_lock
1041  * holds the vnode, it is possible that the vnode may have already
1042  * initiated a recyclement.
1043  *
1044  * We want cache_get() to return a definitively usable vnode or a
1045  * definitively unresolved ncp.
      *
      * Adds a reference, acquires the lock with _cache_lock(), and, if the
      * associated vnode is flagged VRECLAIMED, sets the entry back to the
      * unresolved state.  Returns the ncp that was passed in.
1046  */
1047 static
1048 struct namecache *
1049 _cache_get(struct namecache *ncp)
1050 {
1051         _cache_hold(ncp);
1052         _cache_lock(ncp);
1053         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1054                 _cache_setunresolved(ncp);
1055         return(ncp);
1056 }
1057
1058 /*
1059  * Attempt to obtain a shared lock on the ncp.  A shared lock will only
1060  * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1061  * valid.  Otherwise an exclusive lock will be acquired instead.
      *
      * In every case the ncp is returned with exactly one additional
      * reference and locked one way or the other.  The exclusive path is
      * also taken up front when shared locks are disabled
      * (ncp_shared_lock_disable) or the caller passes a non-zero excl.
1062  */
1063 static
1064 struct namecache *
1065 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1066 {
1067         if (ncp_shared_lock_disable || excl ||
1068             (ncp->nc_flag & NCF_UNRESOLVED)) {
1069                 return(_cache_get(ncp));
1070         }
1071         _cache_hold(ncp);
1072         _cache_lock_shared(ncp);
             /*
              * Re-test under the shared lock.  In both fallback cases below
              * _cache_get() takes its own reference, so the one obtained by
              * the _cache_hold() above is surplus and is dropped afterwards.
              */
1073         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1074                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1075                         _cache_unlock(ncp);
1076                         ncp = _cache_get(ncp);
1077                         _cache_drop(ncp);
1078                 }
1079         } else {
1080                 _cache_unlock(ncp);
1081                 ncp = _cache_get(ncp);
1082                 _cache_drop(ncp);
1083         }
1084         return(ncp);
1085 }
1086
1087 /*
1088  * This is a special form of _cache_lock() which only succeeds if
1089  * it can get a pristine, non-recursive lock.  The caller must have
1090  * already ref'd the ncp.
1091  *
1092  * On success the ncp will be locked, on failure it will not.  The
1093  * ref count does not change either way.
1094  *
1095  * We want _cache_lock_special() (on success) to return a definitively
1096  * usable vnode or a definitively unresolved ncp.
      *
      * Returns 0 on success and EWOULDBLOCK on failure.  This routine never
      * blocks; it is built on _cache_lock_nonblock().
1097  */
1098 static int
1099 _cache_lock_special(struct namecache *ncp)
1100 {
1101         if (_cache_lock_nonblock(ncp) == 0) {
                     /*
                      * Ignoring the two request bits, the lock status must
                      * be exactly 1.  Any other value means the lock we just
                      * obtained is not the pristine one we want, so it is
                      * released again.
                      */
1102                 if ((ncp->nc_lockstatus &
1103                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) {
1104                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1105                                 _cache_setunresolved(ncp);
1106                         return(0);
1107                 }
1108                 _cache_unlock(ncp);
1109         }
1110         return(EWOULDBLOCK);
1111 }
1112
1113 /*
1114  * This function tries to get a shared lock but will back-off to an exclusive
1115  * lock if:
1116  *
1117  * (1) Some other thread is trying to obtain an exclusive lock
1118  *     (to prevent the exclusive requester from getting livelocked out
1119  *     by many shared locks).
1120  *
1121  * (2) The current thread already owns an exclusive lock (to avoid
1122  *     deadlocking).
1123  *
1124  * WARNING! On machines with lots of cores we really want to try hard to
1125  *          get a shared lock or concurrent path lookups can chain-react
1126  *          into a very high-latency exclusive lock.
      *
      * Returns 0 with the ncp locked, or EWOULDBLOCK with the ncp unlocked.
      *
      * NOTE(review): As written, case (1) does not acquire an exclusive
      *               lock here.  When the non-blocking shared attempt
      *               succeeds but NC_EXLOCK_REQ is set (or the vnode is
      *               flagged VRECLAIMED) the shared lock is released and
      *               EWOULDBLOCK is returned, presumably leaving the
      *               fallback to the caller - confirm against callers.
      *
      * NOTE(review): The blocking paths at the bottom return 0 without
      *               repeating the NC_EXLOCK_REQ / VRECLAIMED checks.
1127  */
1128 static int
1129 _cache_lock_shared_special(struct namecache *ncp)
1130 {
1131         /*
1132          * Only honor a successful shared lock (returning 0) if there is
1133          * no exclusive request pending and the vnode, if present, is not
1134          * in a reclaimed state.
1135          */
1136         if (_cache_lock_shared_nonblock(ncp) == 0) {
1137                 if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) {
1138                         if (ncp->nc_vp == NULL ||
1139                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
1140                                 return(0);
1141                         }
1142                 }
1143                 _cache_unlock(ncp);
1144                 return(EWOULDBLOCK);
1145         }
1146
1147         /*
1148          * Non-blocking shared lock failed.  If we already own the exclusive
1149          * lock just acquire another exclusive lock (instead of deadlocking).
1150          * Otherwise acquire a shared lock.
1151          */
1152         if (ncp->nc_locktd == curthread) {
1153                 _cache_lock(ncp);
1154                 return(0);
1155         }
1156         _cache_lock_shared(ncp);
1157         return(0);
1158 }
1159
1160
1161 /*
1162  * NOTE: The same nchandle can be passed for both arguments.
      *
      * Fill in target from nch with the ncp referenced and locked via
      * _cache_get(), and take an additional reference on the mount.  The
      * source ncp must already have at least one reference (asserted).
1163  */
1164 void
1165 cache_get(struct nchandle *nch, struct nchandle *target)
1166 {
1167         KKASSERT(nch->ncp->nc_refs > 0);
1168         target->mount = nch->mount;
1169         target->ncp = _cache_get(nch->ncp);
1170         atomic_add_int(&target->mount->mnt_refs, 1);
1171 }
1172
     /*
      * Same as cache_get() but the ncp is obtained through
      * _cache_get_maybe_shared(), so it may come back with a shared lock
      * unless excl is non-zero.  The source ncp must already have at least
      * one reference (asserted).
      */
1173 void
1174 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1175 {
1176         KKASSERT(nch->ncp->nc_refs > 0);
1177         target->mount = nch->mount;
1178         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1179         atomic_add_int(&target->mount->mnt_refs, 1);
1180 }
1181
1182 /*
1183  * Unlock-and-deref: release the lock on ncp and then drop one reference.
      * The unlock is done first because _cache_drop() may itself need to
      * obtain the lock when it is releasing the last reference.
1184  */
1185 static __inline
1186 void
1187 _cache_put(struct namecache *ncp)
1188 {
1189         _cache_unlock(ncp);
1190         _cache_drop(ncp);
1191 }
1192
1193 /*
1194  * Release a handle that is both referenced and locked: the mount
      * reference is released, the ncp is unlocked and dereferenced via
      * _cache_put(), and both fields of the handle are then cleared.
1195  */
1196 void
1197 cache_put(struct nchandle *nch)
1198 {
1199         atomic_add_int(&nch->mount->mnt_refs, -1);
1200         _cache_put(nch->ncp);
1201         nch->ncp = NULL;
1202         nch->mount = NULL;
1203 }
1204
1205 /*
1206  * Resolve an unresolved ncp by associating a vnode with it.  If the
1207  * vnode is NULL, a negative cache entry is created.
1208  *
1209  * The ncp should be locked on entry and will remain locked on return.
      *
      * Both preconditions are asserted: the ncp must be flagged
      * NCF_UNRESOLVED and must be exclusively locked by the calling thread.
      * mp may be NULL.  On return NCF_UNRESOLVED and NCF_DEFEREDZAP are
      * cleared and nc_error is 0 (vnode case) or ENOENT (negative case).
1210  */
1211 static
1212 void
1213 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
1214 {
1215         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
1216         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1217
1218         if (vp != NULL) {
1219                 /*
1220                  * Any vp associated with an ncp which has children must
1221                  * be held.  Any vp associated with a locked ncp must be held.
1222                  */
1223                 if (!TAILQ_EMPTY(&ncp->nc_list))
1224                         vhold(vp);
                     /*
                      * The vnode's namecache list is manipulated under the
                      * vnode's spinlock.
                      */
1225                 spin_lock(&vp->v_spin);
1226                 ncp->nc_vp = vp;
1227                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1228                 spin_unlock(&vp->v_spin);
                     /*
                      * Second hold, on behalf of the lock (request bits are
                      * ignored when testing whether the ncp is locked).
                      */
1229                 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1230                         vhold(vp);
1231
1232                 /*
1233                  * Set auxiliary flags
1234                  */
1235                 switch(vp->v_type) {
1236                 case VDIR:
1237                         ncp->nc_flag |= NCF_ISDIR;
1238                         break;
1239                 case VLNK:
1240                         ncp->nc_flag |= NCF_ISSYMLINK;
1241                         /* XXX cache the contents of the symlink */
1242                         break;
1243                 default:
1244                         break;
1245                 }
1246                 atomic_add_int(&numcache, 1);
1247                 ncp->nc_error = 0;
1248                 /* XXX: this is a hack to work-around the lack of a real pfs vfs
1249                  * implementation*/
                     /*
                      * The length of 5 includes the terminating NUL, so only
                      * a filesystem type of exactly "null" matches.
                      */
1250                 if (mp != NULL)
1251                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1252                                 vp->v_pfsmp = mp;
1253         } else {
1254                 /*
1255                  * When creating a negative cache hit we set the
1256                  * namecache_gen.  A later resolve will clean out the
1257                  * negative cache hit if the mount point's namecache_gen
1258                  * has changed.  Used by devfs, could also be used by
1259                  * other remote FSs.
1260                  */
1261                 ncp->nc_vp = NULL;
                     /*
                      * Negative entries are kept on the global ncneglist,
                      * which along with numneg is protected by ncspin.
                      */
1262                 spin_lock(&ncspin);
1263                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
1264                 ++numneg;
1265                 spin_unlock(&ncspin);
1266                 ncp->nc_error = ENOENT;
1267                 if (mp)
1268                         VFS_NCPGEN_SET(mp, ncp);
1269         }
1270         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
1271 }
1272
1273 /*
1274  * nchandle wrapper around _cache_setvp(): resolve nch->ncp to vp (or to
      * a negative entry when vp is NULL), passing nch->mount along as the
      * mount.  The same preconditions apply: the ncp must be unresolved and
      * exclusively locked by the caller.
1275  */
1276 void
1277 cache_setvp(struct nchandle *nch, struct vnode *vp)
1278 {
1279         _cache_setvp(nch->mount, nch->ncp, vp);
1280 }
1281
1282 /*
1283  * Arrange for nch->ncp to time out nticks ticks from now by storing
      * ticks + nticks in nc_timeout.
      *
      * A stored value of 0 means "no timeout" (_cache_auto_unresolve_test()
      * skips the timeout check when nc_timeout is 0), so if the sum happens
      * to come out as 0 it is bumped to 1.
1284  */
1285 void
1286 cache_settimeout(struct nchandle *nch, int nticks)
1287 {
1288         struct namecache *ncp = nch->ncp;
1289
1290         if ((ncp->nc_timeout = ticks + nticks) == 0)
1291                 ncp->nc_timeout = 1;
1292 }
1293
1294 /*
1295  * Disassociate the vnode or negative-cache association and mark a
1296  * namecache entry as unresolved again.  Note that the ncp is still
1297  * left in the hash table and still linked to its parent.
1298  *
1299  * The ncp should be locked and refd on entry and will remain locked and refd
1300  * on return.
1301  *
1302  * This routine is normally never called on a directory containing children.
1303  * However, NFS often does just that in its rename() code as a cop-out to
1304  * avoid complex namespace operations.  This disconnects a directory vnode
1305  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1306  * sync.
1307  *
      * Nothing is done if the entry is already unresolved.  Otherwise the
      * timeout is cleared, nc_error is reset to ENOTCONN, and the
      * NCF_WHITEOUT, NCF_ISDIR and NCF_ISSYMLINK flags are cleared.
1308  */
1309 static
1310 void
1311 _cache_setunresolved(struct namecache *ncp)
1312 {
1313         struct vnode *vp;
1314
1315         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1316                 ncp->nc_flag |= NCF_UNRESOLVED;
1317                 ncp->nc_timeout = 0;
1318                 ncp->nc_error = ENOTCONN;
1319                 if ((vp = ncp->nc_vp) != NULL) {
                             /*
                              * Positive entry: detach from the vnode's
                              * namecache list under the vnode's spinlock.
                              */
1320                         atomic_add_int(&numcache, -1);
1321                         spin_lock(&vp->v_spin);
1322                         ncp->nc_vp = NULL;
1323                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1324                         spin_unlock(&vp->v_spin);
1325
1326                         /*
1327                          * Any vp associated with an ncp with children is
1328                          * held by that ncp.  Any vp associated with a locked
1329                          * ncp is held by that ncp.  These conditions must be
1330                          * undone when the vp is cleared out from the ncp.
1331                          */
1332                         if (!TAILQ_EMPTY(&ncp->nc_list))
1333                                 vdrop(vp);
1334                         if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1335                                 vdrop(vp);
1336                 } else {
                             /*
                              * Negative entry: remove it from the global
                              * negative list under ncspin.
                              */
1337                         spin_lock(&ncspin);
1338                         TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
1339                         --numneg;
1340                         spin_unlock(&ncspin);
1341                 }
1342                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1343         }
1344 }
1345
1346 /*
1347  * The cache_nresolve() code calls this function to automatically
1348  * set a resolved cache element to unresolved if it has timed out
1349  * or if it is a negative cache hit and the mount point namecache_gen
1350  * has changed.
      *
      * This is only the test; nothing is modified.  Returns 1 if the entry
      * should be set unresolved and 0 if it should be left alone.  The
      * unresolved flag is not examined here, that is left to the caller.
1351  */
1352 static __inline int
1353 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1354 {
1355         /*
1356          * Try to zap entries that have timed out.  We have
1357          * to be careful here because locked leafs may depend
1358          * on the vnode remaining intact in a parent, so only
1359          * do this under very specific conditions.
              *
              * A nc_timeout of 0 means no timeout is set.  The expiry test
              * uses the signed difference against ticks, and only entries
              * without children qualify.
1360          */
1361         if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1362             TAILQ_EMPTY(&ncp->nc_list)) {
1363                 return 1;
1364         }
1365
1366         /*
1367          * If a resolved negative cache hit is invalid due to
1368          * the mount's namecache generation being bumped, zap it.
1369          */
1370         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1371                 return 1;
1372         }
1373
1374         /*
1375          * Otherwise we are good
1376          */
1377         return 0;
1378 }
1379
     /*
      * Set a resolved ncp back to unresolved if
      * _cache_auto_unresolve_test() says it has expired or gone stale.
      * Calls _cache_setunresolved(), so the ncp is expected to be locked
      * and referenced by the caller.
      */
1380 static __inline void
1381 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1382 {
1383         /*
1384          * Already in an unresolved state, nothing to do.
1385          */
1386         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1387                 if (_cache_auto_unresolve_test(mp, ncp))
1388                         _cache_setunresolved(ncp);
1389         }
1390 }
1391
1392 /*
1393  * nchandle wrapper around _cache_setunresolved(): disassociate nch->ncp
      * from its vnode or negative-cache state and mark it unresolved.  The
      * ncp should be locked and referenced by the caller.
1394  */
1395 void
1396 cache_setunresolved(struct nchandle *nch)
1397 {
1398         _cache_setunresolved(nch->ncp);
1399 }
1400
1401 /*
1402  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1403  * looking for matches.  This flag tells the lookup code when it must
1404  * check for a mount linkage and also prevents the directories in question
1405  * from being deleted or renamed.
      *
      * This is the per-mount callback handed to mountlist_scan() by
      * cache_clrmountpt(); data is the nchandle being tested.  Returns 1 if
      * either mnt_ncmounton or mnt_ncmountpt of mp refers to the handle's
      * ncp, and 0 otherwise.
1406  */
1407 static
1408 int
1409 cache_clrmountpt_callback(struct mount *mp, void *data)
1410 {
1411         struct nchandle *nch = data;
1412
1413         if (mp->mnt_ncmounton.ncp == nch->ncp)
1414                 return(1);
1415         if (mp->mnt_ncmountpt.ncp == nch->ncp)
1416                 return(1);
1417         return(0);
1418 }
1419
1420 /*
1421  * Clear NCF_ISMOUNTPT on nch->ncp if no mount still refers to it.
      *
      * The mount list is scanned with cache_clrmountpt_callback() and the
      * flag is cleared only when the scan result is 0.
      *
      * NOTE(review): presumably mountlist_scan() returns the accumulated
      *               callback results, so 0 means no mount matched -
      *               confirm against mountlist_scan().
1422  */
1423 void
1424 cache_clrmountpt(struct nchandle *nch)
1425 {
1426         int count;
1427
1428         count = mountlist_scan(cache_clrmountpt_callback, nch,
1429                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
1430         if (count == 0)
1431                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1432 }
1433
1434 /*
1435  * Invalidate portions of the namecache topology given a starting entry.
1436  * The passed ncp is set to an unresolved state and:
1437  *
1438  * The passed ncp must be referenced and locked.  The routine may unlock
1439  * and relock ncp several times, and will recheck the children and loop
1440  * to catch races.  When done the passed ncp will be returned with the
1441  * reference and lock intact.
1442  *
1443  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
1444  *                        that the physical underlying nodes have been
1445  *                        destroyed... as in deleted.  For example, when
1446  *                        a directory is removed.  This will cause record
1447  *                        lookups on the name to no longer be able to find
1448  *                        the record and tells the resolver to return failure
1449  *                        rather than trying to resolve through the parent.
1450  *
1451  *                        The topology itself, including ncp->nc_name,
1452  *                        remains intact.
1453  *
1454  *                        This only applies to the passed ncp, if CINV_CHILDREN
1455  *                        is specified the children are not flagged.
1456  *
1457  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
1458  *                        state as well.
1459  *
1460  *                        Note that this will also have the side effect of
1461  *                        cleaning out any unreferenced nodes in the topology
1462  *                        from the leaves up as the recursion backs out.
1463  *
1464  * Note that the topology for any referenced nodes remains intact, but
1465  * the nodes will be marked as having been destroyed and will be set
1466  * to an unresolved state.
1467  *
1468  * It is possible for cache_inval() to race a cache_resolve(), meaning that
1469  * the namecache entry may not actually be invalidated on return if it was
1470  * revalidated while recursing down into its children.  This code guarantees
1471  * that the node(s) will go through an invalidation cycle, but does not
1472  * guarantee that they will remain in an invalidated state.
1473  *
1474  * Returns non-zero if a revalidation was detected during the invalidation
1475  * recursion, zero otherwise.  Note that since only the original ncp is
1476  * locked the revalidation ultimately can only indicate that the original ncp
1477  * *MIGHT* now have been reresolved.
1478  *
1479  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1480  * have to avoid blowing out the kernel stack.  We do this by saving the
1481  * deep namecache node and aborting the recursion, then re-recursing at that
1482  * node using a depth-first algorithm in order to allow multiple deep
1483  * recursions to chain through each other, then we restart the invalidation
1484  * from scratch.
1485  */
1486
1487 struct cinvtrack {
1488         struct namecache *resume_ncp;
                                     /* held node at which an over-deep */
                                     /* recursion was cut off, or NULL */
1489         int depth;              /* current child-recursion depth */
1490 };
1491
1492 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1493
1494 static
1495 int
1496 _cache_inval(struct namecache *ncp, int flags)
1497 {
1498         struct cinvtrack track;
1499         struct namecache *ncp2;
1500         int r;
1501
1502         track.depth = 0;
1503         track.resume_ncp = NULL;
1504
             /*
              * Run the invalidation from the top.  If it completes without
              * hitting the recursion limit (resume_ncp stays NULL) we are
              * done and r is the result of that final full pass.
              */
1505         for (;;) {
1506                 r = _cache_inval_internal(ncp, flags, &track);
1507                 if (track.resume_ncp == NULL)
1508                         break;
1509                 kprintf("Warning: deep namecache recursion at %s\n",
1510                         ncp->nc_name);
                     /*
                      * The recursion was cut short.  Unlock the top node and
                      * work on the saved deep node instead, repeating for as
                      * long as those passes are themselves cut short.  Each
                      * saved node arrives held but unlocked; _cache_put()
                      * releases both our lock and that hold.  CINV_DESTROY
                      * is only ever applied to the original ncp.
                      */
1511                 _cache_unlock(ncp);
1512                 while ((ncp2 = track.resume_ncp) != NULL) {
1513                         track.resume_ncp = NULL;
1514                         _cache_lock(ncp2);
1515                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1516                                              &track);
1517                         _cache_put(ncp2);
1518                 }
                     /* relock the top node and restart from scratch */
1519                 _cache_lock(ncp);
1520         }
1521         return(r);
1522 }
1523
     /*
      * nchandle wrapper around _cache_inval().  nch->ncp must be referenced
      * and exclusively locked by the caller; flags is a mask of CINV_DESTROY
      * and CINV_CHILDREN.  Returns non-zero if a revalidation was detected
      * during the invalidation, zero otherwise.
      */
1524 int
1525 cache_inval(struct nchandle *nch, int flags)
1526 {
1527         return(_cache_inval(nch->ncp, flags));
1528 }
1529
1530 /*
1531  * Helper for _cache_inval().  The passed ncp is refd and locked and
1532  * remains that way on return, but may be unlocked/relocked multiple
1533  * times by the routine.
      *
      * The lock must be exclusive (asserted).  track carries the recursion
      * depth and, when the depth limit is exceeded, the node at which the
      * caller should resume.  The return value is a count of revalidations
      * noticed, plus one if the depth limit was hit at this level.
1534  */
1535 static int
1536 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1537 {
1538         struct namecache *kid;
1539         struct namecache *nextkid;
1540         int rcnt = 0;
1541
1542         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1543
1544         _cache_setunresolved(ncp);
1545         if (flags & CINV_DESTROY) {
1546                 ncp->nc_flag |= NCF_DESTROYED;
1547                 ++ncp->nc_generation;
1548         }
1549         if ((flags & CINV_CHILDREN) &&
1550             (kid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1551         ) {
                     /*
                      * Hold the first child before unlocking the parent.
                      */
1552                 _cache_hold(kid);
                     /*
                      * Too deep.  Record this node (with its own hold) as
                      * the resume point; the loop below then bails out on
                      * its first iteration.
                      */
1553                 if (++track->depth > MAX_RECURSION_DEPTH) {
1554                         track->resume_ncp = ncp;
1555                         _cache_hold(ncp);
1556                         ++rcnt;
1557                 }
1558                 _cache_unlock(ncp);
1559                 while (kid) {
                             /*
                              * A resume point was set, either above or by a
                              * deeper level.  Stop scanning.
                              */
1560                         if (track->resume_ncp) {
1561                                 _cache_drop(kid);
1562                                 break;
1563                         }
                             /*
                              * Hold the next sibling before working on this
                              * child.
                              */
1564                         if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1565                                 _cache_hold(nextkid);
                             /*
                              * Children that are already unresolved and have
                              * no children of their own are skipped.
                              * CINV_DESTROY is never passed down.
                              */
1566                         if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1567                             TAILQ_FIRST(&kid->nc_list)
1568                         ) {
1569                                 _cache_lock(kid);
1570                                 rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
1571                                 _cache_unlock(kid);
1572                         }
1573                         _cache_drop(kid);
1574                         kid = nextkid;
1575                 }
1576                 --track->depth;
1577                 _cache_lock(ncp);
1578         }
1579
1580         /*
1581          * Someone could have gotten in there while ncp was unlocked,
1582          * retry if so.
              *
              * (No retry happens here; the event is only counted in the
              * return value.)
1583          */
1584         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1585                 ++rcnt;
1586         return (rcnt);
1587 }
1588
/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * vp is the vnode whose v_namecache list is scanned and flags is passed
 * through unchanged to _cache_inval() for every entry.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *       be ripped out of the topology while held, the vnode's v_namecache
 *       list has no such restriction.  NCP's can be ripped out of the list
 *       at virtually any time if not locked, even if held.
 *
 *       In addition, the v_namecache list itself must be locked via
 *       the vnode's spinlock.
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
        struct namecache *ncp;
        struct namecache *next;

restart:
        spin_lock(&vp->v_spin);
        ncp = TAILQ_FIRST(&vp->v_namecache);
        if (ncp)
                _cache_hold(ncp);
        while (ncp) {
                /* loop entered with ncp held and vp spin-locked */
                if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
                        _cache_hold(next);
                /* the spinlock is released before taking the ncp lock */
                spin_unlock(&vp->v_spin);
                _cache_lock(ncp);
                if (ncp->nc_vp != vp) {
                        /*
                         * ncp no longer belongs to vp by the time we got
                         * its lock.  Release everything and rescan the
                         * list from the top.
                         */
                        kprintf("Warning: cache_inval_vp: race-A detected on "
                                "%s\n", ncp->nc_name);
                        _cache_put(ncp);
                        if (next)
                                _cache_drop(next);
                        goto restart;
                }
                _cache_inval(ncp, flags);
                _cache_put(ncp);                /* also releases reference */
                ncp = next;
                spin_lock(&vp->v_spin);
                if (ncp && ncp->nc_vp != vp) {
                        /*
                         * The entry we held as the successor is no longer
                         * associated with vp, so it cannot be used to
                         * continue the scan.  Start over.
                         */
                        spin_unlock(&vp->v_spin);
                        kprintf("Warning: cache_inval_vp: race-B detected on "
                                "%s\n", ncp->nc_name);
                        _cache_drop(ncp);
                        goto restart;
                }
        }
        spin_unlock(&vp->v_spin);
        /* sampled after the spinlock has been released */
        return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
1645
1646 /*
1647  * This routine is used instead of the normal cache_inval_vp() when we
1648  * are trying to recycle otherwise good vnodes.
1649  *
1650  * Return 0 on success, non-zero if not all namecache records could be
1651  * disassociated from the vnode (for various reasons).
1652  */
1653 int
1654 cache_inval_vp_nonblock(struct vnode *vp)
1655 {
1656         struct namecache *ncp;
1657         struct namecache *next;
1658
1659         spin_lock(&vp->v_spin);
1660         ncp = TAILQ_FIRST(&vp->v_namecache);
1661         if (ncp)
1662                 _cache_hold(ncp);
1663         while (ncp) {
1664                 /* loop entered with ncp held */
1665                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1666                         _cache_hold(next);
1667                 spin_unlock(&vp->v_spin);
1668                 if (_cache_lock_nonblock(ncp)) {
1669                         _cache_drop(ncp);
1670                         if (next)
1671                                 _cache_drop(next);
1672                         goto done;
1673                 }
1674                 if (ncp->nc_vp != vp) {
1675                         kprintf("Warning: cache_inval_vp: race-A detected on "
1676                                 "%s\n", ncp->nc_name);
1677                         _cache_put(ncp);
1678                         if (next)
1679                                 _cache_drop(next);
1680                         goto done;
1681                 }
1682                 _cache_inval(ncp, 0);
1683                 _cache_put(ncp);                /* also releases reference */
1684                 ncp = next;
1685                 spin_lock(&vp->v_spin);
1686                 if (ncp && ncp->nc_vp != vp) {
1687                         spin_unlock(&vp->v_spin);
1688                         kprintf("Warning: cache_inval_vp: race-B detected on "
1689                                 "%s\n", ncp->nc_name);
1690                         _cache_drop(ncp);
1691                         goto done;
1692                 }
1693         }
1694         spin_unlock(&vp->v_spin);
1695 done:
1696         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1697 }
1698
/*
 * The source ncp has been renamed to the target ncp.  Both fncp and tncp
 * must be locked.  The target ncp is destroyed (as a normal rename-over
 * would destroy the target file or directory).
 *
 * Because there may be references to the source ncp we cannot copy its
 * contents to the target.  Instead the source ncp is relinked as the target
 * and the target ncp is removed from the namecache topology.
 *
 * fnch and tnch are the handles for the source and target; only their
 * ncp members are used here.
 */
void
cache_rename(struct nchandle *fnch, struct nchandle *tnch)
{
        struct namecache *fncp = fnch->ncp;
        struct namecache *tncp = tnch->ncp;
        struct namecache *tncp_par;
        struct nchash_head *nchpp;
        u_int32_t hash;
        char *oname;
        char *nname;

        /* both entries change identity, bump their generation counts */
        ++fncp->nc_generation;
        ++tncp->nc_generation;

        /*
         * Make a private, NUL-terminated copy of the target's name for
         * fncp to take over.  A zero-length target name yields NULL.
         */
        if (tncp->nc_nlen) {
                nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
                bcopy(tncp->nc_name, nname, tncp->nc_nlen);
                nname[tncp->nc_nlen] = 0;
        } else {
                nname = NULL;
        }

        /*
         * Rename fncp (unlink)
         *
         * Detach fncp from its current parent, install the new name and
         * free the old one.
         */
        _cache_unlink_parent(fncp);
        oname = fncp->nc_name;
        fncp->nc_name = nname;
        fncp->nc_nlen = tncp->nc_nlen;
        if (oname)
                kfree(oname, M_VFSCACHE);

        /* the target's parent is held and locked across the relink */
        tncp_par = tncp->nc_parent;
        _cache_hold(tncp_par);
        _cache_lock(tncp_par);

        /*
         * Rename fncp (relink)
         *
         * The hash covers the new name followed by the parent pointer
         * value, and selects the hash chain fncp is linked into.
         */
        hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
        hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
        nchpp = NCHHASH(hash);

        /* the chain's spinlock is held while linking */
        spin_lock(&nchpp->spin);
        _cache_link_parent(fncp, tncp_par, nchpp);
        spin_unlock(&nchpp->spin);

        /* unlock and release the hold taken on the parent above */
        _cache_put(tncp_par);

        /*
         * Get rid of the overwritten tncp (unlink)
         */
        _cache_unlink(tncp);
}
1761
1762 /*
1763  * Perform actions consistent with unlinking a file.  The passed-in ncp
1764  * must be locked.
1765  *
1766  * The ncp is marked DESTROYED so it no longer shows up in searches,
1767  * and will be physically deleted when the vnode goes away.
1768  *
1769  * If the related vnode has no refs then we cycle it through vget()/vput()
1770  * to (possibly if we don't have a ref race) trigger a deactivation,
1771  * allowing the VFS to trivially detect and recycle the deleted vnode
1772  * via VOP_INACTIVE().
1773  *
1774  * NOTE: _cache_rename() will automatically call _cache_unlink() on the
1775  *       target ncp.
1776  */
1777 void
1778 cache_unlink(struct nchandle *nch)
1779 {
1780         _cache_unlink(nch->ncp);
1781 }
1782
1783 static void
1784 _cache_unlink(struct namecache *ncp)
1785 {
1786         struct vnode *vp;
1787
1788         /*
1789          * Causes lookups to fail and allows another ncp with the same
1790          * name to be created under ncp->nc_parent.
1791          */
1792         ncp->nc_flag |= NCF_DESTROYED;
1793         ++ncp->nc_generation;
1794
1795         /*
1796          * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
1797          * force action on the 1->0 transition.
1798          */
1799         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1800             (vp = ncp->nc_vp) != NULL) {
1801                 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
1802                 if (VREFCNT(vp) <= 0) {
1803                         if (vget(vp, LK_SHARED) == 0)
1804                                 vput(vp);
1805                 }
1806         }
1807 }
1808
1809 /*
1810  * Return non-zero if the nch might be associated with an open and/or mmap()'d
1811  * file.  The easy solution is to just return non-zero if the vnode has refs.
1812  * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
1813  * force the reclaim).
1814  */
1815 int
1816 cache_isopen(struct nchandle *nch)
1817 {
1818         struct vnode *vp;
1819         struct namecache *ncp = nch->ncp;
1820
1821         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1822             (vp = ncp->nc_vp) != NULL &&
1823             VREFCNT(vp)) {
1824                 return 1;
1825         }
1826         return 0;
1827 }
1828
1829
/*
 * vget the vnode associated with the namecache entry.  Resolve the namecache
 * entry if necessary.  The passed ncp must be referenced and locked.  If
 * the ncp is resolved it might be locked shared.
 *
 * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
 * (depending on the passed lk_type) vnode will be returned in *vpp with an
 * error of 0, or NULL will be returned in *vpp with a non-0 error code.  The
 * most typical error is ENOENT, meaning that the ncp represents a negative
 * cache hit and there is no vnode to retrieve, but other errors can occur
 * too.
 *
 * The vget() can race a reclaim.  If this occurs we re-resolve the
 * namecache entry.
 *
 * There are numerous places in the kernel where vget() is called on a
 * vnode while one or more of its namecache entries is locked.  Releasing
 * a vnode never deadlocks against locked namecache entries (the vnode
 * will not get recycled while referenced ncp's exist).  This means we
 * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
 * lock when acquiring the vp lock or we might cause a deadlock.
 *
 * NOTE: The passed-in ncp must be locked exclusively if it is initially
 *       unresolved.  If a reclaim race occurs the passed-in ncp will be
 *       relocked exclusively before being re-resolved.
 */
int
cache_vget(struct nchandle *nch, struct ucred *cred,
           int lk_type, struct vnode **vpp)
{
        struct namecache *ncp;
        struct vnode *vp;
        int error;

        ncp = nch->ncp;
again:
        vp = NULL;
        /* resolve first if needed; an already resolved entry is used as-is */
        if (ncp->nc_flag & NCF_UNRESOLVED)
                error = cache_resolve(nch, cred);
        else
                error = 0;

        if (error == 0 && (vp = ncp->nc_vp) != NULL) {
                error = vget(vp, lk_type);
                if (error) {
                        /*
                         * VRECLAIM race
                         *
                         * The ncp may have been locked shared, we must relock
                         * it exclusively before we can set it to unresolved.
                         */
                        if (error == ENOENT) {
                                kprintf("Warning: vnode reclaim race detected "
                                        "in cache_vget on %p (%s)\n",
                                        vp, ncp->nc_name);
                                _cache_unlock(ncp);
                                _cache_lock(ncp);
                                _cache_setunresolved(ncp);
                                goto again;
                        }

                        /*
                         * Not a reclaim race, some other error.
                         */
                        KKASSERT(ncp->nc_vp == vp);
                        vp = NULL;
                } else {
                        KKASSERT(ncp->nc_vp == vp);
                        KKASSERT((vp->v_flag & VRECLAIMED) == 0);
                }
        }
        /* resolved without error but no vnode: negative cache hit */
        if (error == 0 && vp == NULL)
                error = ENOENT;
        *vpp = vp;
        return(error);
}
1906
1907 /*
1908  * Similar to cache_vget() but only acquires a ref on the vnode.
1909  *
1910  * NOTE: The passed-in ncp must be locked exclusively if it is initially
1911  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
1912  *       relocked exclusively before being re-resolved.
1913  */
1914 int
1915 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
1916 {
1917         struct namecache *ncp;
1918         struct vnode *vp;
1919         int error;
1920
1921         ncp = nch->ncp;
1922 again:
1923         vp = NULL;
1924         if (ncp->nc_flag & NCF_UNRESOLVED)
1925                 error = cache_resolve(nch, cred);
1926         else
1927                 error = 0;
1928
1929         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1930                 error = vget(vp, LK_SHARED);
1931                 if (error) {
1932                         /*
1933                          * VRECLAIM race
1934                          */
1935                         if (error == ENOENT) {
1936                                 kprintf("Warning: vnode reclaim race detected "
1937                                         "in cache_vget on %p (%s)\n",
1938                                         vp, ncp->nc_name);
1939                                 _cache_unlock(ncp);
1940                                 _cache_lock(ncp);
1941                                 _cache_setunresolved(ncp);
1942                                 goto again;
1943                         }
1944
1945                         /*
1946                          * Not a reclaim race, some other error.
1947                          */
1948                         KKASSERT(ncp->nc_vp == vp);
1949                         vp = NULL;
1950                 } else {
1951                         KKASSERT(ncp->nc_vp == vp);
1952                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1953                         /* caller does not want a lock */
1954                         vn_unlock(vp);
1955                 }
1956         }
1957         if (error == 0 && vp == NULL)
1958                 error = ENOENT;
1959         *vpp = vp;
1960         return(error);
1961 }
1962
1963 /*
1964  * Return a referenced vnode representing the parent directory of
1965  * ncp.
1966  *
1967  * Because the caller has locked the ncp it should not be possible for
1968  * the parent ncp to go away.  However, the parent can unresolve its
1969  * dvp at any time so we must be able to acquire a lock on the parent
1970  * to safely access nc_vp.
1971  *
1972  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
1973  * so use vhold()/vdrop() while holding the lock to prevent dvp from
1974  * getting destroyed.
1975  *
1976  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
1977  *       lock on the ncp in question..
1978  */
1979 static struct vnode *
1980 cache_dvpref(struct namecache *ncp)
1981 {
1982         struct namecache *par;
1983         struct vnode *dvp;
1984
1985         dvp = NULL;
1986         if ((par = ncp->nc_parent) != NULL) {
1987                 _cache_hold(par);
1988                 _cache_lock(par);
1989                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
1990                         if ((dvp = par->nc_vp) != NULL)
1991                                 vhold(dvp);
1992                 }
1993                 _cache_unlock(par);
1994                 if (dvp) {
1995                         if (vget(dvp, LK_SHARED) == 0) {
1996                                 vn_unlock(dvp);
1997                                 vdrop(dvp);
1998                                 /* return refd, unlocked dvp */
1999                         } else {
2000                                 vdrop(dvp);
2001                                 dvp = NULL;
2002                         }
2003                 }
2004                 _cache_drop(par);
2005         }
2006         return(dvp);
2007 }
2008
2009 /*
2010  * Convert a directory vnode to a namecache record without any other
2011  * knowledge of the topology.  This ONLY works with directory vnodes and
2012  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
2013  * returned ncp (if not NULL) will be held and unlocked.
2014  *
2015  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2016  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2017  * for dvp.  This will fail only if the directory has been deleted out from
2018  * under the caller.
2019  *
2020  * Callers must always check for a NULL return no matter the value of 'makeit'.
2021  *
 * To avoid overflowing the kernel stack each recursive call increments
 * the makeit variable.  Once makeit exceeds 20 the non-recursive
 * cache_fromdvp_try() is used instead of recursing further.
2024  */
2025
2026 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2027                                   struct vnode *dvp, char *fakename);
2028 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2029                                   struct vnode **saved_dvp);
2030
/*
 * Locate (makeit == 0) or locate-or-construct (makeit != 0) a namecache
 * record for the directory vnode dvp and return it, held, in nch->ncp.
 * nch->mount is set from dvp->v_mount.
 *
 * Returns 0 if nch->ncp was set, EINVAL otherwise.
 */
int
cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
              struct nchandle *nch)
{
        struct vnode *saved_dvp;
        struct vnode *pvp;
        char *fakename;
        int error;

        nch->ncp = NULL;
        nch->mount = dvp->v_mount;
        saved_dvp = NULL;
        fakename = NULL;

        /*
         * Handle the makeit == 0 degenerate case
         */
        if (makeit == 0) {
                spin_lock_shared(&dvp->v_spin);
                nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
                if (nch->ncp)
                        cache_hold(nch);
                spin_unlock_shared(&dvp->v_spin);
        }

        /*
         * Loop until resolution, inside code will break out on error.
         */
        while (makeit) {
                /*
                 * Break out if we successfully acquire a working ncp.
                 */
                spin_lock_shared(&dvp->v_spin);
                nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
                if (nch->ncp) {
                        cache_hold(nch);
                        spin_unlock_shared(&dvp->v_spin);
                        break;
                }
                spin_unlock_shared(&dvp->v_spin);

                /*
                 * If dvp is the root of its filesystem it should already
                 * have a namecache pointer associated with it as a side
                 * effect of the mount, but it may have been disassociated.
                 *
                 * nch->ncp is only used temporarily here.  It is put
                 * again right after the resolve attempt and is reloaded
                 * at the top of the loop on success.
                 */
                if (dvp->v_flag & VROOT) {
                        nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
                        error = cache_resolve_mp(nch->mount);
                        _cache_put(nch->ncp);
                        if (ncvp_debug) {
                                kprintf("cache_fromdvp: resolve root of mount %p error %d",
                                        dvp->v_mount, error);
                        }
                        if (error) {
                                if (ncvp_debug)
                                        kprintf(" failed\n");
                                nch->ncp = NULL;
                                break;
                        }
                        if (ncvp_debug)
                                kprintf(" succeeded\n");
                        continue;
                }

                /*
                 * If we are recursed too deeply resort to an O(n^2)
                 * algorithm to resolve the namecache topology.  The
                 * resolved pvp is left referenced in saved_dvp to
                 * prevent the tree from being destroyed while we loop.
                 */
                if (makeit > 20) {
                        error = cache_fromdvp_try(dvp, cred, &saved_dvp);
                        if (error) {
                                kprintf("lookupdotdot(longpath) failed %d "
                                       "dvp %p\n", error, dvp);
                                nch->ncp = NULL;
                                break;
                        }
                        continue;
                }

                /*
                 * Get the parent directory and resolve its ncp.
                 *
                 * Any fakename left over from a previous iteration is
                 * freed before the lookup can hand back a new one.
                 */
                if (fakename) {
                        kfree(fakename, M_TEMP);
                        fakename = NULL;
                }
                error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
                                          &fakename);
                if (error) {
                        /* nch->ncp is still NULL from the check above */
                        kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
                        break;
                }
                vn_unlock(pvp);

                /*
                 * Reuse makeit as a recursion depth counter.  On success
                 * nch will be fully referenced.
                 */
                cache_fromdvp(pvp, cred, makeit + 1, nch);
                vrele(pvp);
                if (nch->ncp == NULL)
                        break;

                /*
                 * Do an inefficient scan of pvp (embodied by ncp) to look
                 * for dvp.  This will create a namecache record for dvp on
                 * success.  We loop up to recheck on success.
                 *
                 * ncp and dvp are both held but not locked.
                 *
                 * pvp has already been released above and is only
                 * printed as a pointer value below.
                 */
                error = cache_inefficient_scan(nch, cred, dvp, fakename);
                if (error) {
                        kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
                                pvp, nch->ncp->nc_name, dvp);
                        cache_drop(nch);
                        /* nch was NULLed out, reload mount */
                        nch->mount = dvp->v_mount;
                        break;
                }
                if (ncvp_debug) {
                        kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
                                pvp, nch->ncp->nc_name);
                }
                cache_drop(nch);
                /* nch was NULLed out, reload mount */
                nch->mount = dvp->v_mount;
        }

        /*
         * If nch->ncp is non-NULL it will have been held already.
         */
        if (fakename)
                kfree(fakename, M_TEMP);
        if (saved_dvp)
                vrele(saved_dvp);
        if (nch->ncp)
                return (0);
        return (EINVAL);
}
2173
/*
 * Go up the chain of parent directories until we find something
 * we can resolve into the namecache.  This is very inefficient.
 *
 * Starting at dvp, parents are looked up with vop_nlookupdotdot() until
 * one is found that has a namecache record or is the root of its
 * filesystem.  The last directory visited below that parent is then
 * passed to cache_inefficient_scan() and is left referenced in
 * *saved_dvp, replacing (and releasing) any vnode previously stored
 * there.
 *
 * Returns 0 on success or the error from the dotdot lookup, the mount
 * point resolve, or the scan.
 */
static
int
cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
                  struct vnode **saved_dvp)
{
        struct nchandle nch;
        struct vnode *pvp;
        int error;
        static time_t last_fromdvp_report;      /* rate limits the warning */
        char *fakename;

        /*
         * Loop getting the parent directory vnode until we get something we
         * can resolve in the namecache.
         *
         * dvp carries its own reference throughout the loop: the vref()
         * here for the initial dvp, and the reference returned by the
         * dotdot lookup once dvp has been advanced to a parent.
         */
        vref(dvp);
        nch.mount = dvp->v_mount;
        nch.ncp = NULL;
        fakename = NULL;

        for (;;) {
                /* drop the fakename from the previous iteration, if any */
                if (fakename) {
                        kfree(fakename, M_TEMP);
                        fakename = NULL;
                }
                error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
                                          &fakename);
                if (error) {
                        vrele(dvp);
                        break;
                }
                vn_unlock(pvp);
                /* stop as soon as a parent with a namecache record is found */
                spin_lock_shared(&pvp->v_spin);
                if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
                        _cache_hold(nch.ncp);
                        spin_unlock_shared(&pvp->v_spin);
                        vrele(pvp);
                        break;
                }
                spin_unlock_shared(&pvp->v_spin);
                /*
                 * The parent is a filesystem root without a namecache
                 * record; try to resolve the mount point instead.  The
                 * ncp is left held (but unlocked) on success.
                 */
                if (pvp->v_flag & VROOT) {
                        nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
                        error = cache_resolve_mp(nch.mount);
                        _cache_unlock(nch.ncp);
                        vrele(pvp);
                        if (error) {
                                _cache_drop(nch.ncp);
                                nch.ncp = NULL;
                                vrele(dvp);
                        }
                        break;
                }
                /* move up one level; pvp's reference becomes dvp's */
                vrele(dvp);
                dvp = pvp;
        }
        if (error == 0) {
                /* warn at most once per distinct time_uptime value */
                if (last_fromdvp_report != time_uptime) {
                        last_fromdvp_report = time_uptime;
                        kprintf("Warning: extremely inefficient path "
                                "resolution on %s\n",
                                nch.ncp->nc_name);
                }
                error = cache_inefficient_scan(&nch, cred, dvp, fakename);

                /*
                 * Hopefully dvp now has a namecache record associated with
                 * it.  Leave it referenced to prevent the kernel from
                 * recycling the vnode.  Otherwise extremely long directory
                 * paths could result in endless recycling.
                 */
                if (*saved_dvp)
                    vrele(*saved_dvp);
                *saved_dvp = dvp;
                _cache_drop(nch.ncp);
        }
        if (fakename)
                kfree(fakename, M_TEMP);
        return (error);
}
2257
2258 /*
2259  * Do an inefficient scan of the directory represented by ncp looking for
2260  * the directory vnode dvp.  ncp must be held but not locked on entry and
2261  * will be held on return.  dvp must be refd but not locked on entry and
2262  * will remain refd on return.
2263  *
2264  * Why do this at all?  Well, due to its stateless nature the NFS server
2265  * converts file handles directly to vnodes without necessarily going through
2266  * the namecache ops that would otherwise create the namecache topology
 * leading to the vnode.  We could either (1) Change the namecache algorithms
 * to allow disconnected namecache records that are re-merged
 * opportunistically, or (2) Make the NFS server backtrack and scan to
 * recover a connected namecache topology in order to then be able to issue
 * new API lookups.
2271  *
2272  * It turns out that (1) is a huge mess.  It takes a nice clean set of
2273  * namecache algorithms and introduces a lot of complication in every subsystem
2274  * that calls into the namecache to deal with the re-merge case, especially
2275  * since we are using the namecache to placehold negative lookups and the
 * vnode might not be immediately assigned. (2) is certainly far less
 * efficient than (1), but since we are only talking about directories here
2278  * (which are likely to remain cached), the case does not actually run all
2279  * that often and has the supreme advantage of not polluting the namecache
2280  * algorithms.
2281  *
2282  * If a fakename is supplied just construct a namecache entry using the
2283  * fake name.
2284  */
2285 static int
2286 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2287                        struct vnode *dvp, char *fakename)
2288 {
2289         struct nlcomponent nlc;
2290         struct nchandle rncp;
2291         struct dirent *den;
2292         struct vnode *pvp;
2293         struct vattr vat;
2294         struct iovec iov;
2295         struct uio uio;
2296         int blksize;
2297         int eofflag;
2298         int bytes;
2299         char *rbuf;
2300         int error;
2301
2302         vat.va_blocksize = 0;
2303         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2304                 return (error);
2305         cache_lock(nch);
2306         error = cache_vref(nch, cred, &pvp);
2307         cache_unlock(nch);
2308         if (error)
2309                 return (error);
2310         if (ncvp_debug) {
2311                 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2312                         "vattr fileid = %lld\n",
2313                         nch->ncp, nch->ncp->nc_name,
2314                         vat.va_blocksize,
2315                         (long long)vat.va_fileid);
2316         }
2317
2318         /*
2319          * Use the supplied fakename if not NULL.  Fake names are typically
2320          * not in the actual filesystem hierarchy.  This is used by HAMMER
2321          * to glue @@timestamp recursions together.
2322          */
2323         if (fakename) {
2324                 nlc.nlc_nameptr = fakename;
2325                 nlc.nlc_namelen = strlen(fakename);
2326                 rncp = cache_nlookup(nch, &nlc);
2327                 goto done;
2328         }
2329
2330         if ((blksize = vat.va_blocksize) == 0)
2331                 blksize = DEV_BSIZE;
2332         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2333         rncp.ncp = NULL;
2334
2335         eofflag = 0;
2336         uio.uio_offset = 0;
2337 again:
2338         iov.iov_base = rbuf;
2339         iov.iov_len = blksize;
2340         uio.uio_iov = &iov;
2341         uio.uio_iovcnt = 1;
2342         uio.uio_resid = blksize;
2343         uio.uio_segflg = UIO_SYSSPACE;
2344         uio.uio_rw = UIO_READ;
2345         uio.uio_td = curthread;
2346
2347         if (ncvp_debug >= 2)
2348                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2349         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2350         if (error == 0) {
2351                 den = (struct dirent *)rbuf;
2352                 bytes = blksize - uio.uio_resid;
2353
2354                 while (bytes > 0) {
2355                         if (ncvp_debug >= 2) {
2356                                 kprintf("cache_inefficient_scan: %*.*s\n",
2357                                         den->d_namlen, den->d_namlen,
2358                                         den->d_name);
2359                         }
2360                         if (den->d_type != DT_WHT &&
2361                             den->d_ino == vat.va_fileid) {
2362                                 if (ncvp_debug) {
2363                                         kprintf("cache_inefficient_scan: "
2364                                                "MATCHED inode %lld path %s/%*.*s\n",
2365                                                (long long)vat.va_fileid,
2366                                                nch->ncp->nc_name,
2367                                                den->d_namlen, den->d_namlen,
2368                                                den->d_name);
2369                                 }
2370                                 nlc.nlc_nameptr = den->d_name;
2371                                 nlc.nlc_namelen = den->d_namlen;
2372                                 rncp = cache_nlookup(nch, &nlc);
2373                                 KKASSERT(rncp.ncp != NULL);
2374                                 break;
2375                         }
2376                         bytes -= _DIRENT_DIRSIZ(den);
2377                         den = _DIRENT_NEXT(den);
2378                 }
2379                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2380                         goto again;
2381         }
2382         kfree(rbuf, M_TEMP);
2383 done:
2384         vrele(pvp);
2385         if (rncp.ncp) {
2386                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2387                         _cache_setvp(rncp.mount, rncp.ncp, dvp);
2388                         if (ncvp_debug >= 2) {
2389                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2390                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2391                         }
2392                 } else {
2393                         if (ncvp_debug >= 2) {
2394                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2395                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2396                                         rncp.ncp->nc_vp);
2397                         }
2398                 }
2399                 if (rncp.ncp->nc_vp == NULL)
2400                         error = rncp.ncp->nc_error;
2401                 /*
2402                  * Release rncp after a successful nlookup.  rncp was fully
2403                  * referenced.
2404                  */
2405                 cache_put(&rncp);
2406         } else {
2407                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2408                         dvp, nch->ncp->nc_name);
2409                 error = ENOENT;
2410         }
2411         return (error);
2412 }
2413
2414 /*
2415  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2416  * state, which disassociates it from its vnode or ncneglist.
2417  *
2418  * Then, if there are no additional references to the ncp and no children,
2419  * the ncp is removed from the topology and destroyed.
2420  *
2421  * References and/or children may exist if the ncp is in the middle of the
2422  * topology, preventing the ncp from being destroyed.
2423  *
2424  * This function must be called with the ncp held and locked and will unlock
2425  * and drop it during zapping.
2426  *
2427  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2428  * This case can occur in the cache_drop() path.
2429  *
2430  * This function may returned a held (but NOT locked) parent node which the
2431  * caller must drop.  We do this so _cache_drop() can loop, to avoid
2432  * blowing out the kernel stack.
2433  *
2434  * WARNING!  For MPSAFE operation this routine must acquire up to three
2435  *           spin locks to be able to safely test nc_refs.  Lock order is
2436  *           very important.
2437  *
2438  *           hash spinlock if on hash list
2439  *           parent spinlock if child of parent
2440  *           (the ncp is unresolved so there is no vnode association)
2441  */
2442 static struct namecache *
2443 cache_zap(struct namecache *ncp, int nonblock)
2444 {
2445         struct namecache *par;
2446         struct vnode *dropvp;
2447         int refs;
2448
2449         /*
2450          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2451          */
2452         _cache_setunresolved(ncp);
2453
2454         /*
2455          * Try to scrap the entry and possibly tail-recurse on its parent.
2456          * We only scrap unref'd (other then our ref) unresolved entries,
2457          * we do not scrap 'live' entries.
2458          *
2459          * Note that once the spinlocks are acquired if nc_refs == 1 no
2460          * other references are possible.  If it isn't, however, we have
2461          * to decrement but also be sure to avoid a 1->0 transition.
2462          */
2463         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2464         KKASSERT(ncp->nc_refs > 0);
2465
2466         /*
2467          * Acquire locks.  Note that the parent can't go away while we hold
2468          * a child locked.
2469          */
2470         if ((par = ncp->nc_parent) != NULL) {
2471                 if (nonblock) {
2472                         for (;;) {
2473                                 if (_cache_lock_nonblock(par) == 0)
2474                                         break;
2475                                 refs = ncp->nc_refs;
2476                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2477                                 ++numdefered;   /* MP race ok */
2478                                 if (atomic_cmpset_int(&ncp->nc_refs,
2479                                                       refs, refs - 1)) {
2480                                         _cache_unlock(ncp);
2481                                         return(NULL);
2482                                 }
2483                                 cpu_pause();
2484                         }
2485                         _cache_hold(par);
2486                 } else {
2487                         _cache_hold(par);
2488                         _cache_lock(par);
2489                 }
2490                 spin_lock(&ncp->nc_head->spin);
2491         }
2492
2493         /*
2494          * If someone other then us has a ref or we have children
2495          * we cannot zap the entry.  The 1->0 transition and any
2496          * further list operation is protected by the spinlocks
2497          * we have acquired but other transitions are not.
2498          */
2499         for (;;) {
2500                 refs = ncp->nc_refs;
2501                 cpu_ccfence();
2502                 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
2503                         break;
2504                 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
2505                         if (par) {
2506                                 spin_unlock(&ncp->nc_head->spin);
2507                                 _cache_put(par);
2508                         }
2509                         _cache_unlock(ncp);
2510                         return(NULL);
2511                 }
2512                 cpu_pause();
2513         }
2514
2515         /*
2516          * We are the only ref and with the spinlocks held no further
2517          * refs can be acquired by others.
2518          *
2519          * Remove us from the hash list and parent list.  We have to
2520          * drop a ref on the parent's vp if the parent's list becomes
2521          * empty.
2522          */
2523         dropvp = NULL;
2524         if (par) {
2525                 struct nchash_head *nchpp = ncp->nc_head;
2526
2527                 KKASSERT(nchpp != NULL);
2528                 LIST_REMOVE(ncp, nc_hash);
2529                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
2530                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
2531                         dropvp = par->nc_vp;
2532                 ncp->nc_head = NULL;
2533                 ncp->nc_parent = NULL;
2534                 spin_unlock(&nchpp->spin);
2535                 _cache_unlock(par);
2536         } else {
2537                 KKASSERT(ncp->nc_head == NULL);
2538         }
2539
2540         /*
2541          * ncp should not have picked up any refs.  Physically
2542          * destroy the ncp.
2543          */
2544         if (ncp->nc_refs != 1) {
2545                 int save_refs = ncp->nc_refs;
2546                 cpu_ccfence();
2547                 panic("cache_zap: %p bad refs %d (%d)\n",
2548                         ncp, save_refs, atomic_fetchadd_int(&ncp->nc_refs, 0));
2549         }
2550         KKASSERT(ncp->nc_refs == 1);
2551         /* _cache_unlock(ncp) not required */
2552         ncp->nc_refs = -1;      /* safety */
2553         if (ncp->nc_name)
2554                 kfree(ncp->nc_name, M_VFSCACHE);
2555         kfree(ncp, M_VFSCACHE);
2556
2557         /*
2558          * Delayed drop (we had to release our spinlocks)
2559          *
2560          * The refed parent (if not  NULL) must be dropped.  The
2561          * caller is responsible for looping.
2562          */
2563         if (dropvp)
2564                 vdrop(dropvp);
2565         return(par);
2566 }
2567
2568 /*
2569  * Clean up dangling negative cache and defered-drop entries in the
2570  * namecache.
2571  *
2572  * This routine is called in the critical path and also called from
2573  * vnlru().  When called from vnlru we use a lower limit to try to
2574  * deal with the negative cache before the critical path has to start
2575  * dealing with it.
2576  */
2577 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2578
2579 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2580 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2581
2582 void
2583 cache_hysteresis(int critpath)
2584 {
2585         int poslimit;
2586         int neglimit = desiredvnodes / ncnegfactor;
2587         int xnumcache = numcache;
2588
2589         if (critpath == 0)
2590                 neglimit = neglimit * 8 / 10;
2591
2592         /*
2593          * Don't cache too many negative hits.  We use hysteresis to reduce
2594          * the impact on the critical path.
2595          */
2596         switch(neg_cache_hysteresis_state[critpath]) {
2597         case CHI_LOW:
2598                 if (numneg > MINNEG && numneg > neglimit) {
2599                         if (critpath)
2600                                 _cache_cleanneg(ncnegflush);
2601                         else
2602                                 _cache_cleanneg(ncnegflush +
2603                                                 numneg - neglimit);
2604                         neg_cache_hysteresis_state[critpath] = CHI_HIGH;
2605                 }
2606                 break;
2607         case CHI_HIGH:
2608                 if (numneg > MINNEG * 9 / 10 &&
2609                     numneg * 9 / 10 > neglimit
2610                 ) {
2611                         if (critpath)
2612                                 _cache_cleanneg(ncnegflush);
2613                         else
2614                                 _cache_cleanneg(ncnegflush +
2615                                                 numneg * 9 / 10 - neglimit);
2616                 } else {
2617                         neg_cache_hysteresis_state[critpath] = CHI_LOW;
2618                 }
2619                 break;
2620         }
2621
2622         /*
2623          * Don't cache too many positive hits.  We use hysteresis to reduce
2624          * the impact on the critical path.
2625          *
2626          * Excessive positive hits can accumulate due to large numbers of
2627          * hardlinks (the vnode cache will not prevent hl ncps from growing
2628          * into infinity).
2629          */
2630         if ((poslimit = ncposlimit) == 0)
2631                 poslimit = desiredvnodes * 2;
2632         if (critpath == 0)
2633                 poslimit = poslimit * 8 / 10;
2634
2635         switch(pos_cache_hysteresis_state[critpath]) {
2636         case CHI_LOW:
2637                 if (xnumcache > poslimit && xnumcache > MINPOS) {
2638                         if (critpath)
2639                                 _cache_cleanpos(ncposflush);
2640                         else
2641                                 _cache_cleanpos(ncposflush +
2642                                                 xnumcache - poslimit);
2643                         pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2644                 }
2645                 break;
2646         case CHI_HIGH:
2647                 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2648                         if (critpath)
2649                                 _cache_cleanpos(ncposflush);
2650                         else
2651                                 _cache_cleanpos(ncposflush +
2652                                                 xnumcache - poslimit * 5 / 6);
2653                 } else {
2654                         pos_cache_hysteresis_state[critpath] = CHI_LOW;
2655                 }
2656                 break;
2657         }
2658
2659         /*
2660          * Clean out dangling defered-zap ncps which could not
2661          * be cleanly dropped if too many build up.  Note
2662          * that numdefered is not an exact number as such ncps
2663          * can be reused and the counter is not handled in a MP
2664          * safe manner by design.
2665          */
2666         if (numdefered > neglimit) {
2667                 _cache_cleandefered();
2668         }
2669 }
2670
2671 /*
2672  * NEW NAMECACHE LOOKUP API
2673  *
2674  * Lookup an entry in the namecache.  The passed par_nch must be referenced
2675  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
2676  * is ALWAYS returned, eve if the supplied component is illegal.
2677  *
2678  * The resulting namecache entry should be returned to the system with
2679  * cache_put() or cache_unlock() + cache_drop().
2680  *
2681  * namecache locks are recursive but care must be taken to avoid lock order
2682  * reversals (hence why the passed par_nch must be unlocked).  Locking
2683  * rules are to order for parent traversals, not for child traversals.
2684  *
2685  * Nobody else will be able to manipulate the associated namespace (e.g.
2686  * create, delete, rename, rename-target) until the caller unlocks the
2687  * entry.
2688  *
2689  * The returned entry will be in one of three states:  positive hit (non-null
2690  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2691  * Unresolved entries must be resolved through the filesystem to associate the
2692  * vnode and/or determine whether a positive or negative hit has occured.
2693  *
2694  * It is not necessary to lock a directory in order to lock namespace under
2695  * that directory.  In fact, it is explicitly not allowed to do that.  A
2696  * directory is typically only locked when being created, renamed, or
2697  * destroyed.
2698  *
2699  * The directory (par) may be unresolved, in which case any returned child
2700  * will likely also be marked unresolved.  Likely but not guarenteed.  Since
2701  * the filesystem lookup requires a resolved directory vnode the caller is
2702  * responsible for resolving the namecache chain top-down.  This API
2703  * specifically allows whole chains to be created in an unresolved state.
2704  */
2705 struct nchandle
2706 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
2707 {
2708         struct nchandle nch;
2709         struct namecache *ncp;
2710         struct namecache *new_ncp;
2711         struct nchash_head *nchpp;
2712         struct mount *mp;
2713         u_int32_t hash;
2714         globaldata_t gd;
2715         int par_locked;
2716
2717         numcalls++;
2718         gd = mycpu;
2719         mp = par_nch->mount;
2720         par_locked = 0;
2721
2722         /*
2723          * This is a good time to call it, no ncp's are locked by
2724          * the caller or us.
2725          */
2726         cache_hysteresis(1);
2727
2728         /*
2729          * Try to locate an existing entry
2730          */
2731         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2732         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2733         new_ncp = NULL;
2734         nchpp = NCHHASH(hash);
2735 restart:
2736         if (new_ncp)
2737                 spin_lock(&nchpp->spin);
2738         else
2739                 spin_lock_shared(&nchpp->spin);
2740
2741         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2742                 numchecks++;
2743
2744                 /*
2745                  * Break out if we find a matching entry.  Note that
2746                  * UNRESOLVED entries may match, but DESTROYED entries
2747                  * do not.
2748                  */
2749                 if (ncp->nc_parent == par_nch->ncp &&
2750                     ncp->nc_nlen == nlc->nlc_namelen &&
2751                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2752                     (ncp->nc_flag & NCF_DESTROYED) == 0
2753                 ) {
2754                         _cache_hold(ncp);
2755                         if (new_ncp)
2756                                 spin_unlock(&nchpp->spin);
2757                         else
2758                                 spin_unlock_shared(&nchpp->spin);
2759                         if (par_locked) {
2760                                 _cache_unlock(par_nch->ncp);
2761                                 par_locked = 0;
2762                         }
2763                         if (_cache_lock_special(ncp) == 0) {
2764                                 /*
2765                                  * Successfully locked but we must re-test
2766                                  * conditions that might have changed since
2767                                  * we did not have the lock before.
2768                                  */
2769                                 if (ncp->nc_parent != par_nch->ncp ||
2770                                     ncp->nc_nlen != nlc->nlc_namelen ||
2771                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
2772                                          ncp->nc_nlen) ||
2773                                     (ncp->nc_flag & NCF_DESTROYED)) {
2774                                         _cache_put(ncp);
2775                                         goto restart;
2776                                 }
2777                                 _cache_auto_unresolve(mp, ncp);
2778                                 if (new_ncp)
2779                                         _cache_free(new_ncp);
2780                                 goto found;
2781                         }
2782                         _cache_get(ncp);        /* cycle the lock to block */
2783                         _cache_put(ncp);
2784                         _cache_drop(ncp);
2785                         goto restart;
2786                 }
2787         }
2788
2789         /*
2790          * We failed to locate an entry, create a new entry and add it to
2791          * the cache.  The parent ncp must also be locked so we
2792          * can link into it.
2793          *
2794          * We have to relookup after possibly blocking in kmalloc or
2795          * when locking par_nch.
2796          *
2797          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2798          *       mount case, in which case nc_name will be NULL.
2799          */
2800         if (new_ncp == NULL) {
2801                 spin_unlock_shared(&nchpp->spin);
2802                 new_ncp = cache_alloc(nlc->nlc_namelen);
2803                 if (nlc->nlc_namelen) {
2804                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2805                               nlc->nlc_namelen);
2806                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
2807                 }
2808                 goto restart;
2809         }
2810
2811         /*
2812          * NOTE! The spinlock is held exclusively here because new_ncp
2813          *       is non-NULL.
2814          */
2815         if (par_locked == 0) {
2816                 spin_unlock(&nchpp->spin);
2817                 _cache_lock(par_nch->ncp);
2818                 par_locked = 1;
2819                 goto restart;
2820         }
2821
2822         /*
2823          * WARNING!  We still hold the spinlock.  We have to set the hash
2824          *           table entry atomically.
2825          */
2826         ncp = new_ncp;
2827         _cache_link_parent(ncp, par_nch->ncp, nchpp);
2828         spin_unlock(&nchpp->spin);
2829         _cache_unlock(par_nch->ncp);
2830         /* par_locked = 0 - not used */
2831 found:
2832         /*
2833          * stats and namecache size management
2834          */
2835         if (ncp->nc_flag & NCF_UNRESOLVED)
2836                 ++gd->gd_nchstats->ncs_miss;
2837         else if (ncp->nc_vp)
2838                 ++gd->gd_nchstats->ncs_goodhits;
2839         else
2840                 ++gd->gd_nchstats->ncs_neghits;
2841         nch.mount = mp;
2842         nch.ncp = ncp;
2843         atomic_add_int(&nch.mount->mnt_refs, 1);
2844         return(nch);
2845 }
2846
2847 /*
2848  * Attempt to lookup a namecache entry and return with a shared namecache
2849  * lock.
2850  */
2851 int
2852 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc,
2853                            int excl, struct nchandle *res_nch)
2854 {
2855         struct namecache *ncp;
2856         struct nchash_head *nchpp;
2857         struct mount *mp;
2858         u_int32_t hash;
2859         globaldata_t gd;
2860
2861         /*
2862          * If exclusive requested or shared namecache locks are disabled,
2863          * return failure.
2864          */
2865         if (ncp_shared_lock_disable || excl)
2866                 return(EWOULDBLOCK);
2867
2868         numcalls++;
2869         gd = mycpu;
2870         mp = par_nch->mount;
2871
2872         /*
2873          * This is a good time to call it, no ncp's are locked by
2874          * the caller or us.
2875          */
2876         cache_hysteresis(1);
2877
2878         /*
2879          * Try to locate an existing entry
2880          */
2881         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2882         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2883         nchpp = NCHHASH(hash);
2884
2885         spin_lock_shared(&nchpp->spin);
2886
2887         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2888                 numchecks++;
2889
2890                 /*
2891                  * Break out if we find a matching entry.  Note that
2892                  * UNRESOLVED entries may match, but DESTROYED entries
2893                  * do not.
2894                  */
2895                 if (ncp->nc_parent == par_nch->ncp &&
2896                     ncp->nc_nlen == nlc->nlc_namelen &&
2897                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2898                     (ncp->nc_flag & NCF_DESTROYED) == 0
2899                 ) {
2900                         _cache_hold(ncp);
2901                         spin_unlock_shared(&nchpp->spin);
2902                         if (_cache_lock_shared_special(ncp) == 0) {
2903                                 if (ncp->nc_parent == par_nch->ncp &&
2904                                     ncp->nc_nlen == nlc->nlc_namelen &&
2905                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
2906                                          ncp->nc_nlen) == 0 &&
2907                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
2908                                     (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
2909                                     _cache_auto_unresolve_test(mp, ncp) == 0) {
2910                                         goto found;
2911                                 }
2912                                 _cache_unlock(ncp);
2913                         }
2914                         _cache_drop(ncp);
2915                         spin_lock_shared(&nchpp->spin);
2916                         break;
2917                 }
2918         }
2919
2920         /*
2921          * Failure
2922          */
2923         spin_unlock_shared(&nchpp->spin);
2924         return(EWOULDBLOCK);
2925
2926         /*
2927          * Success
2928          *
2929          * Note that nc_error might be non-zero (e.g ENOENT).
2930          */
2931 found:
2932         res_nch->mount = mp;
2933         res_nch->ncp = ncp;
2934         ++gd->gd_nchstats->ncs_goodhits;
2935         atomic_add_int(&res_nch->mount->mnt_refs, 1);
2936
2937         KKASSERT(ncp->nc_error != EWOULDBLOCK);
2938         return(ncp->nc_error);
2939 }
2940
2941 /*
2942  * This is a non-blocking verison of cache_nlookup() used by
2943  * nfs_readdirplusrpc_uio().  It can fail for any reason and
2944  * will return nch.ncp == NULL in that case.
2945  */
2946 struct nchandle
2947 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
2948 {
2949         struct nchandle nch;
2950         struct namecache *ncp;
2951         struct namecache *new_ncp;
2952         struct nchash_head *nchpp;
2953         struct mount *mp;
2954         u_int32_t hash;
2955         globaldata_t gd;
2956         int par_locked;
2957
2958         numcalls++;
2959         gd = mycpu;
2960         mp = par_nch->mount;
2961         par_locked = 0;
2962
2963         /*
2964          * Try to locate an existing entry
2965          */
2966         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2967         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2968         new_ncp = NULL;
2969         nchpp = NCHHASH(hash);
2970 restart:
2971         spin_lock(&nchpp->spin);
2972         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2973                 numchecks++;
2974
2975                 /*
2976                  * Break out if we find a matching entry.  Note that
2977                  * UNRESOLVED entries may match, but DESTROYED entries
2978                  * do not.
2979                  */
2980                 if (ncp->nc_parent == par_nch->ncp &&
2981                     ncp->nc_nlen == nlc->nlc_namelen &&
2982                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2983                     (ncp->nc_flag & NCF_DESTROYED) == 0
2984                 ) {
2985                         _cache_hold(ncp);
2986                         spin_unlock(&nchpp->spin);
2987                         if (par_locked) {
2988                                 _cache_unlock(par_nch->ncp);
2989                                 par_locked = 0;
2990                         }
2991                         if (_cache_lock_special(ncp) == 0) {
2992                                 _cache_auto_unresolve(mp, ncp);
2993                                 if (new_ncp) {
2994                                         _cache_free(new_ncp);
2995                                         new_ncp = NULL;
2996                                 }
2997                                 goto found;
2998                         }
2999                         _cache_drop(ncp);
3000                         goto failed;
3001                 }
3002         }
3003
3004         /*
3005          * We failed to locate an entry, create a new entry and add it to
3006          * the cache.  The parent ncp must also be locked so we
3007          * can link into it.
3008          *
3009          * We have to relookup after possibly blocking in kmalloc or
3010          * when locking par_nch.
3011          *
3012          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3013          *       mount case, in which case nc_name will be NULL.
3014          */
3015         if (new_ncp == NULL) {
3016                 spin_unlock(&nchpp->spin);
3017                 new_ncp = cache_alloc(nlc->nlc_namelen);
3018                 if (nlc->nlc_namelen) {
3019                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3020                               nlc->nlc_namelen);
3021                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3022                 }
3023                 goto restart;
3024         }
3025         if (par_locked == 0) {
3026                 spin_unlock(&nchpp->spin);
3027                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3028                         par_locked = 1;
3029                         goto restart;
3030                 }
3031                 goto failed;
3032         }
3033
3034         /*
3035          * WARNING!  We still hold the spinlock.  We have to set the hash
3036          *           table entry atomically.
3037          */
3038         ncp = new_ncp;
3039         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3040         spin_unlock(&nchpp->spin);
3041         _cache_unlock(par_nch->ncp);
3042         /* par_locked = 0 - not used */
3043 found:
3044         /*
3045          * stats and namecache size management
3046          */
3047         if (ncp->nc_flag & NCF_UNRESOLVED)
3048                 ++gd->gd_nchstats->ncs_miss;
3049         else if (ncp->nc_vp)
3050                 ++gd->gd_nchstats->ncs_goodhits;
3051         else
3052                 ++gd->gd_nchstats->ncs_neghits;
3053         nch.mount = mp;
3054         nch.ncp = ncp;
3055         atomic_add_int(&nch.mount->mnt_refs, 1);
3056         return(nch);
3057 failed:
3058         if (new_ncp) {
3059                 _cache_free(new_ncp);
3060                 new_ncp = NULL;
3061         }
3062         nch.mount = NULL;
3063         nch.ncp = NULL;
3064         return(nch);
3065 }
3066
/*
 * The namecache entry is marked as being used as a mount point.
 * Locate the mount if it is visible to the caller.  The DragonFly
 * mount system allows arbitrary loops in the topology and disentangles
 * those loops by matching against (mp, ncp) rather than just (ncp).
 * This means any given ncp can dive any number of mounts, depending
 * on the relative mount (e.g. nullfs) the caller is at in the topology.
 *
 * We use a very simple frontend cache to reduce SMP conflicts,
 * which we have to do because the mountlist scan needs an exclusive
 * lock around its ripout info list.  Not to mention that there might
 * be a lot of mounts.
 *
 * findmount_info carries the search key into the mountlist scan
 * callback and carries the matching mount back out.
 */
struct findmount_info {
	struct mount *result;		/* out: matching mount, else NULL */
	struct mount *nch_mount;	/* in: mount half of the (mp, ncp) key */
	struct namecache *nch_ncp;	/* in: ncp half of the (mp, ncp) key */
};
3085
3086 static
3087 struct ncmount_cache *
3088 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3089 {
3090         int hash;
3091
3092         hash = ((int)(intptr_t)mp / sizeof(*mp)) ^
3093                ((int)(intptr_t)ncp / sizeof(*ncp));
3094         hash = (hash & 0x7FFFFFFF) % NCMOUNT_NUMCACHE;
3095         return (&ncmount_cache[hash]);
3096 }
3097
3098 static
3099 int
3100 cache_findmount_callback(struct mount *mp, void *data)
3101 {
3102         struct findmount_info *info = data;
3103
3104         /*
3105          * Check the mount's mounted-on point against the passed nch.
3106          */
3107         if (mp->mnt_ncmounton.mount == info->nch_mount &&
3108             mp->mnt_ncmounton.ncp == info->nch_ncp
3109         ) {
3110             info->result = mp;
3111             atomic_add_int(&mp->mnt_refs, 1);
3112             return(-1);
3113         }
3114         return(0);
3115 }
3116
3117 struct mount *
3118 cache_findmount(struct nchandle *nch)
3119 {
3120         struct findmount_info info;
3121         struct ncmount_cache *ncc;
3122         struct mount *mp;
3123
3124         /*
3125          * Fast
3126          */
3127         if (ncmount_cache_enable == 0) {
3128                 ncc = NULL;
3129                 goto skip;
3130         }
3131         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3132         if (ncc->ncp == nch->ncp) {
3133                 spin_lock_shared(&ncc->spin);
3134                 if (ncc->isneg == 0 &&
3135                     ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) {
3136                         if (mp->mnt_ncmounton.mount == nch->mount &&
3137                             mp->mnt_ncmounton.ncp == nch->ncp) {
3138                                 /*
3139                                  * Cache hit (positive)
3140                                  */
3141                                 atomic_add_int(&mp->mnt_refs, 1);
3142                                 spin_unlock_shared(&ncc->spin);
3143                                 ++ncmount_cache_hit;
3144                                 return(mp);
3145                         }
3146                         /* else cache miss */
3147                 }
3148                 if (ncc->isneg &&
3149                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3150                         /*
3151                          * Cache hit (negative)
3152                          */
3153                         spin_unlock_shared(&ncc->spin);
3154                         ++ncmount_cache_hit;
3155                         return(NULL);
3156                 }
3157                 spin_unlock_shared(&ncc->spin);
3158         }
3159 skip:
3160
3161         /*
3162          * Slow
3163          */
3164         info.result = NULL;
3165         info.nch_mount = nch->mount;
3166         info.nch_ncp = nch->ncp;
3167         mountlist_scan(cache_findmount_callback, &info,
3168                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
3169
3170         /*
3171          * Cache the result.
3172          *
3173          * Negative lookups: We cache the originating {ncp,mp}. (mp) is
3174          *                   only used for pointer comparisons and is not
3175          *                   referenced (otherwise there would be dangling
3176          *                   refs).
3177          *
3178          * Positive lookups: We cache the originating {ncp} and the target
3179          *                   (mp).  (mp) is referenced.
3180          *
3181          * Indeterminant:    If the match is undergoing an unmount we do
3182          *                   not cache it to avoid racing cache_unmounting(),
3183          *                   but still return the match.
3184          */
3185         if (ncc) {
3186                 spin_lock(&ncc->spin);
3187                 if (info.result == NULL) {
3188                         if (ncc->isneg == 0 && ncc->mp)
3189                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3190                         ncc->ncp = nch->ncp;
3191                         ncc->mp = nch->mount;
3192                         ncc->isneg = 1;
3193                         spin_unlock(&ncc->spin);
3194                         ++ncmount_cache_overwrite;
3195                 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) {
3196                         if (ncc->isneg == 0 && ncc->mp)
3197                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3198                         atomic_add_int(&info.result->mnt_refs, 1);
3199                         ncc->ncp = nch->ncp;
3200                         ncc->mp = info.result;
3201                         ncc->isneg = 0;
3202                         spin_unlock(&ncc->spin);
3203                         ++ncmount_cache_overwrite;
3204                 } else {
3205                         spin_unlock(&ncc->spin);
3206                 }
3207                 ++ncmount_cache_miss;
3208         }
3209         return(info.result);
3210 }
3211
3212 void
3213 cache_dropmount(struct mount *mp)
3214 {
3215         atomic_add_int(&mp->mnt_refs, -1);
3216 }
3217
3218 void
3219 cache_ismounting(struct mount *mp)
3220 {
3221         struct nchandle *nch = &mp->mnt_ncmounton;
3222         struct ncmount_cache *ncc;
3223
3224         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3225         if (ncc->isneg &&
3226             ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3227                 spin_lock(&ncc->spin);
3228                 if (ncc->isneg &&
3229                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3230                         ncc->ncp = NULL;
3231                         ncc->mp = NULL;
3232                 }
3233                 spin_unlock(&ncc->spin);
3234         }
3235 }
3236
3237 void
3238 cache_unmounting(struct mount *mp)
3239 {
3240         struct nchandle *nch = &mp->mnt_ncmounton;
3241         struct ncmount_cache *ncc;
3242
3243         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3244         if (ncc->isneg == 0 &&
3245             ncc->ncp == nch->ncp && ncc->mp == mp) {
3246                 spin_lock(&ncc->spin);
3247                 if (ncc->isneg == 0 &&
3248                     ncc->ncp == nch->ncp && ncc->mp == mp) {
3249                         atomic_add_int(&mp->mnt_refs, -1);
3250                         ncc->ncp = NULL;
3251                         ncc->mp = NULL;
3252                 }
3253                 spin_unlock(&ncc->spin);
3254         }
3255 }
3256
3257 /*
3258  * Resolve an unresolved namecache entry, generally by looking it up.
3259  * The passed ncp must be locked and refd.
3260  *
3261  * Theoretically since a vnode cannot be recycled while held, and since
3262  * the nc_parent chain holds its vnode as long as children exist, the
3263  * direct parent of the cache entry we are trying to resolve should
3264  * have a valid vnode.  If not then generate an error that we can
3265  * determine is related to a resolver bug.
3266  *
3267  * However, if a vnode was in the middle of a recyclement when the NCP
3268  * got locked, ncp->nc_vp might point to a vnode that is about to become
3269  * invalid.  cache_resolve() handles this case by unresolving the entry
3270  * and then re-resolving it.
3271  *
3272  * Note that successful resolution does not necessarily return an error
3273  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
3274  * will be returned.
3275  */
3276 int
3277 cache_resolve(struct nchandle *nch, struct ucred *cred)
3278 {
3279         struct namecache *par_tmp;
3280         struct namecache *par;
3281         struct namecache *ncp;
3282         struct nchandle nctmp;
3283         struct mount *mp;
3284         struct vnode *dvp;
3285         int error;
3286
3287         ncp = nch->ncp;
3288         mp = nch->mount;
3289         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
3290 restart:
3291         /*
3292          * If the ncp is already resolved we have nothing to do.  However,
3293          * we do want to guarentee that a usable vnode is returned when
3294          * a vnode is present, so make sure it hasn't been reclaimed.
3295          */
3296         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3297                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3298                         _cache_setunresolved(ncp);
3299                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
3300                         return (ncp->nc_error);
3301         }
3302
3303         /*
3304          * If the ncp was destroyed it will never resolve again.  This
3305          * can basically only happen when someone is chdir'd into an
3306          * empty directory which is then rmdir'd.  We want to catch this
3307          * here and not dive the VFS because the VFS might actually
3308          * have a way to re-resolve the disconnected ncp, which will
3309          * result in inconsistencies in the cdir/nch for proc->p_fd.
3310          */
3311         if (ncp->nc_flag & NCF_DESTROYED) {
3312                 kprintf("Warning: cache_resolve: ncp '%s' was unlinked\n",
3313                         ncp->nc_name);
3314                 return(EINVAL);
3315         }
3316
3317         /*
3318          * Mount points need special handling because the parent does not
3319          * belong to the same filesystem as the ncp.
3320          */
3321         if (ncp == mp->mnt_ncmountpt.ncp)
3322                 return (cache_resolve_mp(mp));
3323
3324         /*
3325          * We expect an unbroken chain of ncps to at least the mount point,
3326          * and even all the way to root (but this code doesn't have to go
3327          * past the mount point).
3328          */
3329         if (ncp->nc_parent == NULL) {
3330                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3331                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3332                 ncp->nc_error = EXDEV;
3333                 return(ncp->nc_error);
3334         }
3335
3336         /*
3337          * The vp's of the parent directories in the chain are held via vhold()
3338          * due to the existance of the child, and should not disappear.
3339          * However, there are cases where they can disappear:
3340          *
3341          *      - due to filesystem I/O errors.
3342          *      - due to NFS being stupid about tracking the namespace and
3343          *        destroys the namespace for entire directories quite often.
3344          *      - due to forced unmounts.
3345          *      - due to an rmdir (parent will be marked DESTROYED)
3346          *
3347          * When this occurs we have to track the chain backwards and resolve
3348          * it, looping until the resolver catches up to the current node.  We
3349          * could recurse here but we might run ourselves out of kernel stack
3350          * so we do it in a more painful manner.  This situation really should
3351          * not occur all that often, or if it does not have to go back too
3352          * many nodes to resolve the ncp.
3353          */
3354         while ((dvp = cache_dvpref(ncp)) == NULL) {
3355                 /*
3356                  * This case can occur if a process is CD'd into a
3357                  * directory which is then rmdir'd.  If the parent is marked
3358                  * destroyed there is no point trying to resolve it.
3359                  */
3360                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3361                         return(ENOENT);
3362                 par = ncp->nc_parent;
3363                 _cache_hold(par);
3364                 _cache_lock(par);
3365                 while ((par_tmp = par->nc_parent) != NULL &&
3366                        par_tmp->nc_vp == NULL) {
3367                         _cache_hold(par_tmp);
3368                         _cache_lock(par_tmp);
3369                         _cache_put(par);
3370                         par = par_tmp;
3371                 }
3372                 if (par->nc_parent == NULL) {
3373                         kprintf("EXDEV case 2 %*.*s\n",
3374                                 par->nc_nlen, par->nc_nlen, par->nc_name);
3375                         _cache_put(par);
3376                         return (EXDEV);
3377                 }
3378                 /*
3379                  * The parent is not set in stone, ref and lock it to prevent
3380                  * it from disappearing.  Also note that due to renames it
3381                  * is possible for our ncp to move and for par to no longer
3382                  * be one of its parents.  We resolve it anyway, the loop
3383                  * will handle any moves.
3384                  */
3385                 _cache_get(par);        /* additional hold/lock */
3386                 _cache_put(par);        /* from earlier hold/lock */
3387                 if (par == nch->mount->mnt_ncmountpt.ncp) {
3388                         cache_resolve_mp(nch->mount);
3389                 } else if ((dvp = cache_dvpref(par)) == NULL) {
3390                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
3391                         _cache_put(par);
3392                         continue;
3393                 } else {
3394                         if (par->nc_flag & NCF_UNRESOLVED) {
3395                                 nctmp.mount = mp;
3396                                 nctmp.ncp = par;
3397                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3398                         }
3399                         vrele(dvp);
3400                 }
3401                 if ((error = par->nc_error) != 0) {
3402                         if (par->nc_error != EAGAIN) {
3403                                 kprintf("EXDEV case 3 %*.*s error %d\n",
3404                                     par->nc_nlen, par->nc_nlen, par->nc_name,
3405                                     par->nc_error);
3406                                 _cache_put(par);
3407                                 return(error);
3408                         }
3409                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3410                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
3411                 }
3412                 _cache_put(par);
3413                 /* loop */
3414         }
3415
3416         /*
3417          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3418          * ncp's and reattach them.  If this occurs the original ncp is marked
3419          * EAGAIN to force a relookup.
3420          *
3421          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3422          * ncp must already be resolved.
3423          */
3424         if (dvp) {
3425                 nctmp.mount = mp;
3426                 nctmp.ncp = ncp;
3427                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3428                 vrele(dvp);
3429         } else {
3430                 ncp->nc_error = EPERM;
3431         }
3432         if (ncp->nc_error == EAGAIN) {
3433                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3434                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3435                 goto restart;
3436         }
3437         return(ncp->nc_error);
3438 }
3439
3440 /*
3441  * Resolve the ncp associated with a mount point.  Such ncp's almost always
3442  * remain resolved and this routine is rarely called.  NFS MPs tends to force
3443  * re-resolution more often due to its mac-truck-smash-the-namecache
3444  * method of tracking namespace changes.
3445  *
3446  * The semantics for this call is that the passed ncp must be locked on
3447  * entry and will be locked on return.  However, if we actually have to
3448  * resolve the mount point we temporarily unlock the entry in order to
3449  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
3450  * the unlock we have to recheck the flags after we relock.
3451  */
3452 static int
3453 cache_resolve_mp(struct mount *mp)
3454 {
3455         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
3456         struct vnode *vp;
3457         int error;
3458
3459         KKASSERT(mp != NULL);
3460
3461         /*
3462          * If the ncp is already resolved we have nothing to do.  However,
3463          * we do want to guarentee that a usable vnode is returned when
3464          * a vnode is present, so make sure it hasn't been reclaimed.
3465          */
3466         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3467                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3468                         _cache_setunresolved(ncp);
3469         }
3470
3471         if (ncp->nc_flag & NCF_UNRESOLVED) {
3472                 _cache_unlock(ncp);
3473                 while (vfs_busy(mp, 0))
3474                         ;
3475                 error = VFS_ROOT(mp, &vp);
3476                 _cache_lock(ncp);
3477
3478                 /*
3479                  * recheck the ncp state after relocking.
3480                  */
3481                 if (ncp->nc_flag & NCF_UNRESOLVED) {
3482                         ncp->nc_error = error;
3483                         if (error == 0) {
3484                                 _cache_setvp(mp, ncp, vp);
3485                                 vput(vp);
3486                         } else {
3487                                 kprintf("[diagnostic] cache_resolve_mp: failed"
3488                                         " to resolve mount %p err=%d ncp=%p\n",
3489                                         mp, error, ncp);
3490                                 _cache_setvp(mp, ncp, NULL);
3491                         }
3492                 } else if (error == 0) {
3493                         vput(vp);
3494                 }
3495                 vfs_unbusy(mp);
3496         }
3497         return(ncp->nc_error);
3498 }
3499
3500 /*
3501  * Clean out negative cache entries when too many have accumulated.
3502  */
3503 static void
3504 _cache_cleanneg(int count)
3505 {
3506         struct namecache *ncp;
3507
3508         /*
3509          * Attempt to clean out the specified number of negative cache
3510          * entries.
3511          */
3512         while (count) {
3513                 spin_lock(&ncspin);
3514                 ncp = TAILQ_FIRST(&ncneglist);
3515                 if (ncp == NULL) {
3516                         spin_unlock(&ncspin);
3517                         break;
3518                 }
3519                 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
3520                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
3521                 _cache_hold(ncp);
3522                 spin_unlock(&ncspin);
3523
3524                 /*
3525                  * This can race, so we must re-check that the ncp
3526                  * is on the ncneglist after successfully locking it.
3527                  */
3528                 if (_cache_lock_special(ncp) == 0) {
3529                         if (ncp->nc_vp == NULL &&
3530                             (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3531                                 ncp = cache_zap(ncp, 1);
3532                                 if (ncp)
3533                                         _cache_drop(ncp);
3534                         } else {
3535                                 kprintf("cache_cleanneg: race avoided\n");
3536                                 _cache_unlock(ncp);
3537                         }
3538                 } else {
3539                         _cache_drop(ncp);
3540                 }
3541                 --count;
3542         }
3543 }
3544
3545 /*
3546  * Clean out positive cache entries when too many have accumulated.
3547  */
3548 static void
3549 _cache_cleanpos(int count)
3550 {
3551         static volatile int rover;
3552         struct nchash_head *nchpp;
3553         struct namecache *ncp;
3554         int rover_copy;
3555
3556         /*
3557          * Attempt to clean out the specified number of negative cache
3558          * entries.
3559          */
3560         while (count) {
3561                 rover_copy = ++rover;   /* MPSAFEENOUGH */
3562                 cpu_ccfence();
3563                 nchpp = NCHHASH(rover_copy);
3564
3565                 spin_lock_shared(&nchpp->spin);
3566                 ncp = LIST_FIRST(&nchpp->list);
3567                 while (ncp && (ncp->nc_flag & NCF_DESTROYED))
3568                         ncp = LIST_NEXT(ncp, nc_hash);
3569                 if (ncp)
3570                         _cache_hold(ncp);
3571                 spin_unlock_shared(&nchpp->spin);
3572
3573                 if (ncp) {
3574                         if (_cache_lock_special(ncp) == 0) {
3575                                 ncp = cache_zap(ncp, 1);
3576                                 if (ncp)
3577                                         _cache_drop(ncp);
3578                         } else {
3579                                 _cache_drop(ncp);
3580                         }
3581                 }
3582                 --count;
3583         }
3584 }
3585
3586 /*
3587  * This is a kitchen sink function to clean out ncps which we
3588  * tried to zap from cache_drop() but failed because we were
3589  * unable to acquire the parent lock.
3590  *
3591  * Such entries can also be removed via cache_inval_vp(), such
3592  * as when unmounting.
3593  */
3594 static void
3595 _cache_cleandefered(void)
3596 {
3597         struct nchash_head *nchpp;
3598         struct namecache *ncp;
3599         struct namecache dummy;
3600         int i;
3601
3602         numdefered = 0;
3603         bzero(&dummy, sizeof(dummy));
3604         dummy.nc_flag = NCF_DESTROYED;
3605         dummy.nc_refs = 1;
3606
3607         for (i = 0; i <= nchash; ++i) {
3608                 nchpp = &nchashtbl[i];
3609
3610                 spin_lock(&nchpp->spin);
3611                 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
3612                 ncp = &dummy;
3613                 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
3614                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
3615                                 continue;
3616                         LIST_REMOVE(&dummy, nc_hash);
3617                         LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
3618                         _cache_hold(ncp);
3619                         spin_unlock(&nchpp->spin);
3620                         if (_cache_lock_nonblock(ncp) == 0) {
3621                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
3622                                 _cache_unlock(ncp);
3623                         }
3624                         _cache_drop(ncp);
3625                         spin_lock(&nchpp->spin);
3626                         ncp = &dummy;
3627                 }
3628                 LIST_REMOVE(&dummy, nc_hash);
3629                 spin_unlock(&nchpp->spin);
3630         }
3631 }
3632
3633 /*
3634  * Name cache initialization, from vfsinit() when we are booting
3635  */
3636 void
3637 nchinit(void)
3638 {
3639         int i;
3640         globaldata_t gd;
3641
3642         /* initialise per-cpu namecache effectiveness statistics. */
3643         for (i = 0; i < ncpus; ++i) {
3644                 gd = globaldata_find(i);
3645                 gd->gd_nchstats = &nchstats[i];
3646         }
3647         TAILQ_INIT(&ncneglist);
3648         spin_init(&ncspin, "nchinit");
3649         nchashtbl = hashinit_ext(desiredvnodes / 2,
3650                                  sizeof(struct nchash_head),
3651                                  M_VFSCACHE, &nchash);
3652         for (i = 0; i <= (int)nchash; ++i) {
3653                 LIST_INIT(&nchashtbl[i].list);
3654                 spin_init(&nchashtbl[i].spin, "nchinit_hash");
3655         }
3656         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
3657                 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
3658         nclockwarn = 5 * hz;
3659 }
3660
3661 /*
3662  * Called from start_init() to bootstrap the root filesystem.  Returns
3663  * a referenced, unlocked namecache record.
3664  */
3665 void
3666 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
3667 {
3668         nch->ncp = cache_alloc(0);
3669         nch->mount = mp;
3670         atomic_add_int(&mp->mnt_refs, 1);
3671         if (vp)
3672                 _cache_setvp(nch->mount, nch->ncp, vp);
3673 }
3674
3675 /*
3676  * vfs_cache_setroot()
3677  *
3678  *      Create an association between the root of our namecache and
3679  *      the root vnode.  This routine may be called several times during
3680  *      booting.
3681  *
3682  *      If the caller intends to save the returned namecache pointer somewhere
3683  *      it must cache_hold() it.
3684  */
3685 void
3686 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
3687 {
3688         struct vnode *ovp;
3689         struct nchandle onch;
3690
3691         ovp = rootvnode;
3692         onch = rootnch;
3693         rootvnode = nvp;
3694         if (nch)
3695                 rootnch = *nch;
3696         else
3697                 cache_zero(&rootnch);
3698         if (ovp)
3699                 vrele(ovp);
3700         if (onch.ncp)
3701                 cache_drop(&onch);
3702 }
3703
3704 /*
3705  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
3706  * topology and is being removed as quickly as possible.  The new VOP_N*()
3707  * API calls are required to make specific adjustments using the supplied
3708  * ncp pointers rather then just bogusly purging random vnodes.
3709  *
3710  * Invalidate all namecache entries to a particular vnode as well as
3711  * any direct children of that vnode in the namecache.  This is a
3712  * 'catch all' purge used by filesystems that do not know any better.
3713  *
3714  * Note that the linkage between the vnode and its namecache entries will
3715  * be removed, but the namecache entries themselves might stay put due to
3716  * active references from elsewhere in the system or due to the existance of
3717  * the children.   The namecache topology is left intact even if we do not
3718  * know what the vnode association is.  Such entries will be marked
3719  * NCF_UNRESOLVED.
3720  */
3721 void
3722 cache_purge(struct vnode *vp)
3723 {
3724         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
3725 }
3726
3727 static int disablecwd;
3728 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
3729     "Disable getcwd");
3730
3731 static u_long numcwdcalls;
3732 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
3733     "Number of current directory resolution calls");
3734 static u_long numcwdfailnf;
3735 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
3736     "Number of current directory failures due to lack of file");
3737 static u_long numcwdfailsz;
3738 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
3739     "Number of current directory failures due to large result");
3740 static u_long numcwdfound;
3741 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
3742     "Number of current directory resolution successes");
3743
3744 /*
3745  * MPALMOSTSAFE
3746  */
3747 int
3748 sys___getcwd(struct __getcwd_args *uap)
3749 {
3750         u_int buflen;
3751         int error;
3752         char *buf;
3753         char *bp;
3754
3755         if (disablecwd)
3756                 return (ENODEV);
3757
3758         buflen = uap->buflen;
3759         if (buflen == 0)
3760                 return (EINVAL);
3761         if (buflen > MAXPATHLEN)
3762                 buflen = MAXPATHLEN;
3763
3764         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
3765         bp = kern_getcwd(buf, buflen, &error);
3766         if (error == 0)
3767                 error = copyout(bp, uap->buf, strlen(bp) + 1);
3768         kfree(buf, M_TEMP);
3769         return (error);
3770 }
3771
3772 char *
3773 kern_getcwd(char *buf, size_t buflen, int *error)
3774 {
3775         struct proc *p = curproc;
3776         char *bp;
3777         int i, slash_prefixed;
3778         struct filedesc *fdp;
3779         struct nchandle nch;
3780         struct namecache *ncp;
3781
3782         numcwdcalls++;
3783         bp = buf;
3784         bp += buflen - 1;
3785         *bp = '\0';
3786         fdp = p->p_fd;
3787         slash_prefixed = 0;
3788
3789         nch = fdp->fd_ncdir;
3790         ncp = nch.ncp;
3791         if (ncp)
3792                 _cache_hold(ncp);
3793
3794         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
3795                nch.mount != fdp->fd_nrdir.mount)
3796         ) {
3797                 /*
3798                  * While traversing upwards if we encounter the root
3799                  * of the current mount we have to skip to the mount point
3800                  * in the underlying filesystem.
3801                  */
3802                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
3803                         nch = nch.mount->mnt_ncmounton;
3804                         _cache_drop(ncp);
3805                         ncp = nch.ncp;
3806                         if (ncp)
3807                                 _cache_hold(ncp);
3808                         continue;
3809                 }
3810
3811                 /*
3812                  * Prepend the path segment
3813                  */
3814                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
3815                         if (bp == buf) {
3816                                 numcwdfailsz++;
3817                                 *error = ERANGE;
3818                                 bp = NULL;
3819                                 goto done;
3820                         }
3821                         *--bp = ncp->nc_name[i];
3822                 }
3823                 if (bp == buf) {
3824                         numcwdfailsz++;
3825                         *error = ERANGE;
3826                         bp = NULL;
3827                         goto done;
3828                 }
3829                 *--bp = '/';
3830                 slash_prefixed = 1;
3831
3832                 /*
3833                  * Go up a directory.  This isn't a mount point so we don't
3834                  * have to check again.
3835                  */
3836                 while ((nch.ncp = ncp->nc_parent) != NULL) {
3837                         if (ncp_shared_lock_disable)
3838                                 _cache_lock(ncp);
3839                         else
3840                                 _cache_lock_shared(ncp);
3841                         if (nch.ncp != ncp->nc_parent) {
3842                                 _cache_unlock(ncp);
3843                                 continue;
3844                         }
3845                         _cache_hold(nch.ncp);
3846                         _cache_unlock(ncp);
3847                         break;
3848                 }
3849                 _cache_drop(ncp);
3850                 ncp = nch.ncp;
3851         }
3852         if (ncp == NULL) {
3853                 numcwdfailnf++;
3854                 *error = ENOENT;
3855                 bp = NULL;
3856                 goto done;
3857         }
3858         if (!slash_prefixed) {
3859                 if (bp == buf) {
3860                         numcwdfailsz++;
3861                         *error = ERANGE;
3862                         bp = NULL;
3863                         goto done;
3864                 }
3865                 *--bp = '/';
3866         }
3867         numcwdfound++;
3868         *error = 0;
3869 done:
3870         if (ncp)
3871                 _cache_drop(ncp);
3872         return (bp);
3873 }
3874
3875 /*
3876  * Thus begins the fullpath magic.
3877  *
3878  * The passed nchp is referenced but not locked.
3879  */
3880 static int disablefullpath;
3881 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
3882     &disablefullpath, 0,
3883     "Disable fullpath lookups");
3884
3885 static u_int numfullpathcalls;
3886 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD,
3887     &numfullpathcalls, 0,
3888     "Number of full path resolutions in progress");
3889 static u_int numfullpathfailnf;
3890 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD,
3891     &numfullpathfailnf, 0,
3892     "Number of full path resolution failures due to lack of file");
3893 static u_int numfullpathfailsz;
3894 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD,
3895     &numfullpathfailsz, 0,
3896     "Number of full path resolution failures due to insufficient memory");
3897 static u_int numfullpathfound;
3898 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD,
3899     &numfullpathfound, 0,
3900     "Number of full path resolution successes");
3901
3902 /*
3903  * Build the path of *nchp relative to a base directory: *nchbase if
3904  * non-NULL, else p's fd_nrdir, else rootnch.  Components are prepended
3905  * into a MAXPATHLEN buffer while walking nc_parent and crossing mounts.
3906  *
3907  * Returns 0 with *retbuf pointing into the kmalloc'd (M_TEMP) buffer
3908  * handed back in *freebuf (not freed here on success).  On failure both
3909  * stay NULL, the buffer is freed, and the result is ENOENT (the walk hit
3910  * a NULL ncp before the base) or ENOMEM (path exceeds MAXPATHLEN).
3911  */
3912 int
3913 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
3914                char **retbuf, char **freebuf, int guess)
3915 {
3916         struct nchandle fd_nrdir;
3917         struct nchandle nch;
3918         struct namecache *ncp;
3919         struct mount *mp, *new_mp;
3920         char *bp, *buf;
3921         int slash_prefixed = 0;
3922         int error = 0;
3923         int i;
3924
3925         /* Counts resolutions in progress; lowered again at "done". */
3926         atomic_add_int(&numfullpathcalls, 1);
3927
3928         *retbuf = NULL;
3929         *freebuf = NULL;
3930
3931         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
3932         bp = buf + MAXPATHLEN - 1;
3933         *bp = '\0';
3934         if (nchbase)
3935                 fd_nrdir = *nchbase;
3936         else if (p != NULL)
3937                 fd_nrdir = p->p_fd->fd_nrdir;
3938         else
3939                 fd_nrdir = rootnch;
3940         nch = *nchp;
3941         ncp = nch.ncp;
3942         if (ncp)
3943                 _cache_hold(ncp);
3944         mp = nch.mount;
3945
3946         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
3947                 new_mp = NULL;
3948
3949                 /*
3950                  * If we are asked to guess the upwards path, we do so whenever
3951                  * we encounter an ncp marked as a mountpoint. We try to find
3952                  * the actual mountpoint by finding the mountpoint with this
3953                  * ncp.
3954                  */
3955                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT))
3956                         new_mp = mount_get_by_nc(ncp);
3957                 /*
3958                  * While traversing upwards if we encounter the root
3959                  * of the current mount we have to skip to the mount point.
3960                  */
3961                 if (ncp == mp->mnt_ncmountpt.ncp)
3962                         new_mp = mp;
3963                 if (new_mp) {
3964                         nch = new_mp->mnt_ncmounton;
3965                         _cache_drop(ncp);
3966                         ncp = nch.ncp;
3967                         if (ncp)
3968                                 _cache_hold(ncp);
3969                         mp = nch.mount;
3970                         continue;
3971                 }
3972
3973                 /* Prepend the path segment, then its leading '/'. */
3974                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
3975                         if (bp == buf)
3976                                 goto toolong;
3977                         *--bp = ncp->nc_name[i];
3978                 }
3979                 if (bp == buf)
3980                         goto toolong;
3981                 *--bp = '/';
3982                 slash_prefixed = 1;
3983
3984                 /*
3985                  * Go up a directory.  This isn't a mount point so we don't
3986                  * have to check again.
3987                  *
3988                  * We can only safely access nc_parent with ncp held locked.
3989                  */
3990                 while ((nch.ncp = ncp->nc_parent) != NULL) {
3991                         _cache_lock(ncp);
3992                         if (nch.ncp != ncp->nc_parent) {
3993                                 _cache_unlock(ncp);
3994                                 continue;
3995                         }
3996                         _cache_hold(nch.ncp);
3997                         _cache_unlock(ncp);
3998                         break;
3999                 }
4000                 _cache_drop(ncp);
4001                 ncp = nch.ncp;
4002         }
4003         if (ncp == NULL) {
4004                 numfullpathfailnf++;
4005                 kfree(buf, M_TEMP);
4006                 error = ENOENT;
4007                 goto done;
4008         }
4009
4010         /* No component was prepended: the path is just "/". */
4011         if (!slash_prefixed) {
4012                 if (bp == buf)
4013                         goto toolong;
4014                 *--bp = '/';
4015         }
4016         numfullpathfound++;
4017         *retbuf = bp;
4018         *freebuf = buf;
4019         goto done;
4020 toolong:
4021         numfullpathfailsz++;
4022         kfree(buf, M_TEMP);
4023         error = ENOMEM;
4024 done:
4025         if (ncp)
4026                 _cache_drop(ncp);
4027         atomic_add_int(&numfullpathcalls, -1);
4028         return(error);
4029 }
4030
4031 /*
4032  * Resolve a path for vnode vn (p->p_textvp when vn is NULL) from the
4033  * first v_namecache entry with a non-zero nc_nlen, via cache_fullpath().
4034  *
4035  * Returns ENODEV if disablefullpath is set, EINVAL if p, the vnode or a
4036  * named entry is missing, otherwise whatever cache_fullpath() returns.
4037  */
4038 int
4039 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
4040             char **freebuf, int guess)
4041 {
4042         struct namecache *ncp = NULL;
4043         struct nchandle nch;
4044         int error = EINVAL;
4045
4046         *freebuf = NULL;
4047         atomic_add_int(&numfullpathcalls, 1);
4048         if (disablefullpath) {
4049                 error = ENODEV;
4050         } else if (p != NULL && (vn != NULL || (vn = p->p_textvp) != NULL)) {
4051                 /* a NULL vn means the client wants p->p_textvp */
4052                 spin_lock_shared(&vn->v_spin);
4053                 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
4054                         if (ncp->nc_nlen)
4055                                 break;
4056                 }
4057                 if (ncp)
4058                         _cache_hold(ncp);
4059                 spin_unlock_shared(&vn->v_spin);
4060         }
4061         /* Undo our increment on every path, not just the successful one. */
4062         atomic_add_int(&numfullpathcalls, -1);
4063         if (ncp == NULL)
4064                 return (error);
4065         nch.ncp = ncp;
4066         nch.mount = vn->v_mount;
4067         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
4068         _cache_drop(ncp);
4069         return (error);
4070 }