sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  */
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/kernel.h>
  68 #include <sys/sysctl.h>
  69 #include <sys/mount.h>
  70 #include <sys/vnode.h>
  71 #include <sys/malloc.h>
  72 #include <sys/sysproto.h>
  73 #include <sys/spinlock.h>
  74 #include <sys/proc.h>
  75 #include <sys/namei.h>
  76 #include <sys/nlookup.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/fnv_hash.h>
  79 #include <sys/globaldata.h>
  80 #include <sys/kern_syscall.h>
  81 #include <sys/dirent.h>
  82 #include <ddb/ddb.h>
  83
  84 #include <sys/sysref2.h>
  85 #include <sys/spinlock2.h>
  86 #include <sys/mplock2.h>
  87
   88 #define MAX_RECURSION_DEPTH     64      /* NOTE(review): no user in this part of the file; presumably a recursion bound -- confirm */
  89
  90 /*
  91  * Random lookups in the cache are accomplished with a hash table using
  92  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
  93  *
  94  * Negative entries may exist and correspond to resolved namecache
  95  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
  96  * will be set if the entry corresponds to a whited-out directory entry
   97  * (versus simply not finding the entry at all).  ncneglist is locked
  98  * with a global spinlock (ncspin).
  99  *
 100  * MPSAFE RULES:
 101  *
 102  * (1) A ncp must be referenced before it can be locked.
 103  *
 104  * (2) A ncp must be locked in order to modify it.
 105  *
 106  * (3) ncp locks are always ordered child -> parent.  That may seem
 107  *     backwards but forward scans use the hash table and thus can hold
 108  *     the parent unlocked when traversing downward.
 109  *
 110  *     This allows insert/rename/delete/dot-dot and other operations
 111  *     to use ncp->nc_parent links.
 112  *
 113  *     This also prevents a locked up e.g. NFS node from creating a
 114  *     chain reaction all the way back to the root vnode / namecache.
 115  *
 116  * (4) parent linkages require both the parent and child to be locked.
 117  */
 118
  119 /*
  120  * Structures associated with name caching.
       *
       * NCHHASH() maps a hash value to its chain head in nchashtbl[] by
       * ANDing the value with nchash.
       *
       * NOTE(review): MINNEG and MINPOS are not referenced in this part of
       * the file; presumably they are floors for the number of negative and
       * positive entries -- confirm against their users.
  121  */
  122 #define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
  123 #define MINNEG                  1024
  124 #define MINPOS                  1024
  125 #define NCMOUNT_NUMCACHE        1009    /* ncmount_cache[] slots (prime number) */
  126
  127 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
  128
  129 LIST_HEAD(nchash_list, namecache);
  130
      /*
       * Head of one hash chain: the list of namecache entries on the chain
       * plus the chain's own spinlock.
       */
  131 struct nchash_head {
  132        struct nchash_list list;
  133        struct spinlock  spin;
  134 };
  135
      /*
       * One slot of the mount point cache (ncmount_cache[]).  Each slot
       * carries its own spinlock.
       */
  136 struct ncmount_cache {
  137         struct spinlock spin;
  138         struct namecache *ncp;
  139         struct mount *mp;
  140         int isneg;              /* if != 0 mp is originator and not target */
  141 };
  142
  143 static struct nchash_head       *nchashtbl;     /* hash chains, indexed via NCHHASH() */
  144 static struct namecache_list    ncneglist;      /* protected by ncspin */
  145 static struct spinlock          ncspin;         /* global spinlock for ncneglist */
  146 static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];
 147
  148 /*
  149  * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
  150  * to create the namecache infrastructure leading to a dangling vnode.
  151  *
  152  * 0    Only errors are reported
  153  * 1    Successes are reported
  154  * 2    Successes + the whole directory scan is reported
  155  * 3    Force the directory scan code to run as if the parent vnode did
  156  *      not have a namecache record, even if it does have one.
  157  */
  158 static int      ncvp_debug;
  159 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
  160     "Namecache debug level (0-3)");
  161
  162 static u_long   nchash;                 /* hash table size; ANDed with the hash in NCHHASH() */
  163 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
  164     "Size of namecache hash table");
  165
  166 static int      ncnegflush = 10;        /* burst for negative flush */
  167 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
  168     "Batch flush negative entries");
  169
  170 static int      ncposflush = 10;        /* burst for positive flush */
  171 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
  172     "Batch flush positive entries");
  173
  174 static int      ncnegfactor = 16;       /* ratio of negative entries */
  175 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
  176     "Ratio of namecache negative entries");
  177
      /*
       * nclockwarn is passed as the tsleep() timeout by the blocking lock
       * routines; when a sleep times out they print a "blocked on"
       * diagnostic.
       */
  178 static int      nclockwarn;             /* warn on locked entries in ticks */
  179 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
  180     "Warn on locked namecache entries in ticks");
  181
      /*
       * NOTE(review): numdefered and ncposlimit originally shared the same
       * comment, and still share the same sysctl description ("number of
       * cache entries allocated"), which looks copy-pasted.  Their names
       * suggest a count of deferred entries and a tunable (CTLFLAG_RW) limit
       * on positive entries respectively -- confirm against the code that
       * updates them before relying on either description.
       */
  182 static int      numdefered;             /* see NOTE(review) above */
  183 SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
  184     "Number of cache entries allocated");
  185
  186 static int      ncposlimit;             /* see NOTE(review) above */
  187 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
  188     "Number of cache entries allocated");
  189
  190 static int      ncp_shared_lock_disable = 0;
  191 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
  192            &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
  193
      /*
       * Read-only sysctls with no backing variable (NULL pointer argument);
       * the structure size is supplied as the constant value argument.
       */
  194 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
  195     "sizeof(struct vnode)");
  196 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
  197     "sizeof(struct namecache)");
  198
      /*
       * Mount point cache enable switch and its hit/miss/overwrite counters.
       * The counters are exported CTLFLAG_RW, so they are writable from
       * userland.
       */
  199 static int      ncmount_cache_enable = 1;
  200 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
  201            &ncmount_cache_enable, 0, "mount point cache");
  202 static long     ncmount_cache_hit;
  203 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_hit, CTLFLAG_RW,
  204             &ncmount_cache_hit, 0, "mpcache hits");
  205 static long     ncmount_cache_miss;
  206 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_miss, CTLFLAG_RW,
  207             &ncmount_cache_miss, 0, "mpcache misses");
  208 static long     ncmount_cache_overwrite;
  209 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW,
  210             &ncmount_cache_overwrite, 0, "mpcache entry overwrites");
  211
      /*
       * Forward declarations of file-local (static) functions.
       */
  212 static int cache_resolve_mp(struct mount *mp);
  213 static struct vnode *cache_dvpref(struct namecache *ncp);
  214 static void _cache_lock(struct namecache *ncp);
  215 static void _cache_setunresolved(struct namecache *ncp);
  216 static void _cache_cleanneg(int count);
  217 static void _cache_cleanpos(int count);
  218 static void _cache_cleandefered(void);
  219 static void _cache_unlink(struct namecache *ncp);
 220
  221 /*
  222  * The new name cache statistics
       *
       * The counters below are exported read-only (CTLFLAG_RD) under the
       * vfs.cache sysctl node declared here.
  223  */
  224 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
  225 static int numneg;
  226 SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
  227     "Number of negative namecache entries");
  228 static int numcache;
  229 SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
  230     "Number of namecaches entries");
  231 static u_long numcalls;
  232 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0,
  233     "Number of namecache lookups");
  234 static u_long numchecks;
  235 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0,
  236     "Number of checked entries in namecache lookups");
  237
      /*
       * Non-static array with SMP_MAXCPU statistics structures.
       * NOTE(review): presumably each cpu's gd_nchstats points at its own
       * slot here -- confirm where gd_nchstats is initialized.
       */
  238 struct nchstats nchstats[SMP_MAXCPU];
  239 /*
  240  * Export VFS cache effectiveness statistics to user-land.
  241  *
  242  * The statistics are left for aggregation to user-land so
  243  * neat things can be achieved, like observing per-CPU cache
  244  * distribution.
       *
       * One struct nchstats is copied out per cpu, for cpus 0..ncpus-1 in
       * order, each taken through that cpu's gd_nchstats pointer.
       *
       * Returns 0 on success, or the first non-zero error returned by
       * SYSCTL_OUT(), in which case the remaining cpus are not copied out.
  245  */
  246 static int
  247 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
  248 {
  249         struct globaldata *gd;
  250         int i, error;
  251
  252         error = 0;
  253         for (i = 0; i < ncpus; ++i) {
  254                 gd = globaldata_find(i);
  255                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
  256                         sizeof(struct nchstats))))
  257                         break;
  258         }
  259
  260         return (error);
  261 }
  262 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
  263   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
  264
      /*
       * Forward declaration; cache_zap() is called by _cache_drop() below.
       */
  265 static struct namecache *cache_zap(struct namecache *ncp, int nonblock);
 266
  267 /*
  268  * Namespace locking.  The caller must already hold a reference to the
  269  * namecache structure in order to lock/unlock it.  This function prevents
  270  * the namespace from being created or destroyed by accessors other than
  271  * the lock holder.
  272  *
  273  * Note that holding a locked namecache structure prevents other threads
  274  * from making namespace changes (e.g. deleting or creating), prevents
  275  * vnode association state changes by other threads, and prevents the
  276  * namecache entry from being resolved or unresolved by other threads.
  277  *
  278  * An exclusive lock owner has full authority to associate/disassociate
  279  * vnodes and resolve/unresolve the locked ncp.
  280  *
  281  * A shared lock owner only has authority to acquire the underlying vnode,
  282  * if any.
  283  *
  284  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
  285  * fact (when locking) or cleared prior to unlocking.
  286  *
  287  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
  288  *           or recycled, but it does NOT help you if the vnode had already
  289  *           initiated a recyclement.  If this is important, use cache_get()
  290  *           rather than cache_lock() (and deal with the differences in the
  291  *           way the refs counter is handled).  Or, alternatively, make an
  292  *           unconditional call to cache_validate() or cache_resolve()
  293  *           after cache_lock() returns.
       *
       * _cache_lock() acquires ncp exclusively, sleeping until it can.  It
       * asserts that nc_refs is non-zero.  The thread that already owns the
       * exclusive lock may acquire it again (the count is simply bumped).
       * While blocked, each tsleep() uses nclockwarn as its timeout; the
       * first timeout prints a "blocked on" diagnostic and a matching
       * "unblocked" diagnostic is printed once the lock is obtained.
  294  */
  295 static
  296 void
  297 _cache_lock(struct namecache *ncp)
  298 {
  299         thread_t td;
  300         int didwarn;
  301         int begticks;
  302         int error;
  303         u_int count;
  304
  305         KKASSERT(ncp->nc_refs != 0);
  306         didwarn = 0;
  307         begticks = 0;
  308         td = curthread;
  309
  310         for (;;) {
  311                 count = ncp->nc_lockstatus;
  312                 cpu_ccfence();
  313
                      /*
                       * Nothing but request flags set: nobody holds the lock.
                       * count + 1 leaves any pending request flags in place.
                       */
  314                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
  315                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  316                                               count, count + 1)) {
  317                                 /*
  318                                  * The vp associated with a locked ncp must
  319                                  * be held to prevent it from being recycled.
  320                                  *
  321                                  * WARNING!  If VRECLAIMED is set the vnode
  322                                  * could already be in the middle of a recycle.
  323                                  * Callers must use cache_vref() or
  324                                  * cache_vget() on the locked ncp to
  325                                  * validate the vp or set the cache entry
  326                                  * to unresolved.
  327                                  *
  328                                  * NOTE! vhold() is allowed if we hold a
  329                                  *       lock on the ncp (which we do).
  330                                  */
  331                                 ncp->nc_locktd = td;
  332                                 if (ncp->nc_vp)
  333                                         vhold(ncp->nc_vp);
  334                                 break;
  335                         }
  336                         /* cmpset failed */
  337                         continue;
  338                 }
                      /*
                       * Already owned exclusively by this thread: recursive
                       * acquisition, only the count changes.
                       */
  339                 if (ncp->nc_locktd == td) {
  340                         KKASSERT((count & NC_SHLOCK_FLAG) == 0);
  341                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  342                                               count, count + 1)) {
  343                                 break;
  344                         }
  345                         /* cmpset failed */
  346                         continue;
  347                 }
                      /*
                       * Held by someone else.  Interlock the sleep on
                       * &nc_locktd before publishing NC_EXLOCK_REQ;
                       * _cache_unlock() issues wakeup(&nc_locktd) when it
                       * releases the lock with that flag set.
                       */
  348                 tsleep_interlock(&ncp->nc_locktd, 0);
  349                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
  350                                       count | NC_EXLOCK_REQ) == 0) {
  351                         /* cmpset failed */
  352                         continue;
  353                 }
  354                 if (begticks == 0)
  355                         begticks = ticks;
  356                 error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
  357                                "clock", nclockwarn);
  358                 if (error == EWOULDBLOCK) {
                              /* sleep timed out; warn only the first time */
  359                         if (didwarn == 0) {
  360                                 didwarn = ticks;
  361                                 kprintf("[diagnostic] cache_lock: "
  362                                         "blocked on %p %08x",
  363                                         ncp, count);
  364                                 kprintf(" \"%*.*s\"\n",
  365                                         ncp->nc_nlen, ncp->nc_nlen,
  366                                         ncp->nc_name);
  367                         }
  368                 }
  369                 /* loop */
  370         }
              /*
               * Report the time since the first sleep (begticks), rounded
               * to the nearest second.
               */
  371         if (didwarn) {
  372                 kprintf("[diagnostic] cache_lock: unblocked %*.*s after "
  373                         "%d secs\n",
  374                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
  375                         (int)(ticks + (hz / 2) - begticks) / hz);
  376         }
  377 }
 378
  379 /*
  380  * The shared lock works similarly to the exclusive lock except
  381  * nc_locktd is left NULL and we need an interlock (VHOLD) to
  382  * prevent vhold() races, since the moment our cmpset_int succeeds
  383  * another cpu can come in and get its own shared lock.
  384  *
  385  * A critical section is needed to prevent interruption during the
  386  * VHOLD interlock.
       *
       * _cache_lock_shared() sleeps until the shared lock is obtained and
       * asserts that nc_refs is non-zero.  The first shared holder sets
       * NC_SHLOCK_FLAG and holds the vnode; later holders only bump the
       * count and spin until NC_SHLOCK_VHOLD clears.
       *
       * optreq starts as NC_EXLOCK_REQ, so a pending exclusive request
       * stops new sharers from piggy-backing on an existing shared lock.
       * After a tsleep() timeout (nclockwarn ticks) optreq becomes 0 and
       * pending exclusive requests no longer block piggy-backing.
  387  */
  388 static
  389 void
  390 _cache_lock_shared(struct namecache *ncp)
  391 {
  392         int didwarn;
  393         int error;
  394         u_int count;
  395         u_int optreq = NC_EXLOCK_REQ;
  396
  397         KKASSERT(ncp->nc_refs != 0);
  398         didwarn = 0;
  399
  400         for (;;) {
  401                 count = ncp->nc_lockstatus;
  402                 cpu_ccfence();
  403
                      /*
                       * No holders and no exclusive request pending (only
                       * NC_SHLOCK_REQ may be set): become the first sharer.
                       */
  404                 if ((count & ~NC_SHLOCK_REQ) == 0) {
  405                         crit_enter();
  406                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  407                                       count,
  408                                       (count + 1) | NC_SHLOCK_FLAG |
  409                                                     NC_SHLOCK_VHOLD)) {
  410                                 /*
  411                                  * The vp associated with a locked ncp must
  412                                  * be held to prevent it from being recycled.
  413                                  *
  414                                  * WARNING!  If VRECLAIMED is set the vnode
  415                                  * could already be in the middle of a recycle.
  416                                  * Callers must use cache_vref() or
  417                                  * cache_vget() on the locked ncp to
  418                                  * validate the vp or set the cache entry
  419                                  * to unresolved.
  420                                  *
  421                                  * NOTE! vhold() is allowed if we hold a
  422                                  *       lock on the ncp (which we do).
  423                                  */
  424                                 if (ncp->nc_vp)
  425                                         vhold(ncp->nc_vp);
  426                                 atomic_clear_int(&ncp->nc_lockstatus,
  427                                                  NC_SHLOCK_VHOLD);
  428                                 crit_exit();
  429                                 break;
  430                         }
  431                         /* cmpset failed */
  432                         crit_exit();
  433                         continue;
  434                 }
  435
  436                 /*
  437                  * If already held shared we can just bump the count, but
  438                  * only allow this if nobody is trying to get the lock
  439                  * exclusively.  If we are blocking too long ignore excl
  440                  * requests (which can race/deadlock us).
  441                  *
  442                  * VHOLD is a bit of a hack.  Even though we successfully
  443                  * added another shared ref, the cpu that got the first
  444                  * shared ref might not yet have held the vnode.
  445                  */
  446                 if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) {
  447                         KKASSERT((count & ~(NC_EXLOCK_REQ |
  448                                             NC_SHLOCK_REQ |
  449                                             NC_SHLOCK_FLAG)) > 0);
  450                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  451                                               count, count + 1)) {
  452                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
  453                                         cpu_pause();
  454                                 break;
  455                         }
  456                         continue;
  457                 }
                      /*
                       * Must wait.  Interlock the sleep on ncp before
                       * publishing NC_SHLOCK_REQ; _cache_unlock() issues
                       * wakeup(ncp) for shared waiters.
                       */
  458                 tsleep_interlock(ncp, 0);
  459                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
  460                                       count | NC_SHLOCK_REQ) == 0) {
  461                         /* cmpset failed */
  462                         continue;
  463                 }
  464                 error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
  465                 if (error == EWOULDBLOCK) {
                              /* timed out: stop deferring to excl requests */
  466                         optreq = 0;
  467                         if (didwarn == 0) {
  468                                 didwarn = ticks;
  469                                 kprintf("[diagnostic] cache_lock_shared: "
  470                                         "blocked on %p %08x",
  471                                         ncp, count);
  472                                 kprintf(" \"%*.*s\"\n",
  473                                         ncp->nc_nlen, ncp->nc_nlen,
  474                                         ncp->nc_name);
  475                         }
  476                 }
  477                 /* loop */
  478         }
              /*
               * didwarn holds the tick count at the first warning, so the
               * reported time is measured from that warning (truncated to
               * whole seconds).
               */
  479         if (didwarn) {
  480                 kprintf("[diagnostic] cache_lock_shared: "
  481                         "unblocked %*.*s after %d secs\n",
  482                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
  483                         (int)(ticks - didwarn) / hz);
  484         }
  485 }
 486
  487 /*
       * Try to acquire ncp exclusively without sleeping.
       *
       * Returns 0 if the lock was obtained (either freshly, in which case
       * nc_locktd is set and the vnode, if any, is held, or recursively by
       * the thread that already owns it).  Returns EWOULDBLOCK if the lock
       * is held by anyone else; no request flag is set in that case.
       *
  488  * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
  489  *       such as the case where one of its children is locked.
  490  */
  491 static
  492 int
  493 _cache_lock_nonblock(struct namecache *ncp)
  494 {
  495         thread_t td;
  496         u_int count;
  497
  498         td = curthread;
  499
  500         for (;;) {
  501                 count = ncp->nc_lockstatus;
  502
                      /* only request flags set: the lock is free */
  503                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
  504                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  505                                               count, count + 1)) {
  506                                 /*
  507                                  * The vp associated with a locked ncp must
  508                                  * be held to prevent it from being recycled.
  509                                  *
  510                                  * WARNING!  If VRECLAIMED is set the vnode
  511                                  * could already be in the middle of a recycle.
  512                                  * Callers must use cache_vref() or
  513                                  * cache_vget() on the locked ncp to
  514                                  * validate the vp or set the cache entry
  515                                  * to unresolved.
  516                                  *
  517                                  * NOTE! vhold() is allowed if we hold a
  518                                  *       lock on the ncp (which we do).
  519                                  */
  520                                 ncp->nc_locktd = td;
  521                                 if (ncp->nc_vp)
  522                                         vhold(ncp->nc_vp);
  523                                 break;
  524                         }
  525                         /* cmpset failed */
  526                         continue;
  527                 }
                      /* recursive acquisition by the exclusive owner */
  528                 if (ncp->nc_locktd == td) {
  529                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  530                                               count, count + 1)) {
  531                                 break;
  532                         }
  533                         /* cmpset failed */
  534                         continue;
  535                 }
  536                 return(EWOULDBLOCK);
  537         }
  538         return(0);
  539 }
 540
  541 /*
       * Try to acquire ncp shared without sleeping.
       *
  542  * The shared lock works similarly to the exclusive lock except
  543  * nc_locktd is left NULL and we need an interlock (VHOLD) to
  544  * prevent vhold() races, since the moment our cmpset_int succeeds
  545  * another cpu can come in and get its own shared lock.
  546  *
  547  * A critical section is needed to prevent interruption during the
  548  * VHOLD interlock.
       *
       * Returns 0 if the shared lock was obtained.  Returns EWOULDBLOCK if
       * the ncp is held exclusively, or if an exclusive request is pending;
       * no request flag is set in that case.
  549  */
  550 static
  551 int
  552 _cache_lock_shared_nonblock(struct namecache *ncp)
  553 {
  554         u_int count;
  555
  556         for (;;) {
  557                 count = ncp->nc_lockstatus;
  558
                      /*
                       * No holders and no exclusive request pending: become
                       * the first sharer.
                       */
  559                 if ((count & ~NC_SHLOCK_REQ) == 0) {
  560                         crit_enter();
  561                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  562                                       count,
  563                                       (count + 1) | NC_SHLOCK_FLAG |
  564                                                     NC_SHLOCK_VHOLD)) {
  565                                 /*
  566                                  * The vp associated with a locked ncp must
  567                                  * be held to prevent it from being recycled.
  568                                  *
  569                                  * WARNING!  If VRECLAIMED is set the vnode
  570                                  * could already be in the middle of a recycle.
  571                                  * Callers must use cache_vref() or
  572                                  * cache_vget() on the locked ncp to
  573                                  * validate the vp or set the cache entry
  574                                  * to unresolved.
  575                                  *
  576                                  * NOTE! vhold() is allowed if we hold a
  577                                  *       lock on the ncp (which we do).
  578                                  */
  579                                 if (ncp->nc_vp)
  580                                         vhold(ncp->nc_vp);
  581                                 atomic_clear_int(&ncp->nc_lockstatus,
  582                                                  NC_SHLOCK_VHOLD);
  583                                 crit_exit();
  584                                 break;
  585                         }
  586                         /* cmpset failed */
  587                         crit_exit();
  588                         continue;
  589                 }
  590
  591                 /*
  592                  * If already held shared we can just bump the count, but
  593                  * only allow this if nobody is trying to get the lock
  594                  * exclusively.
  595                  *
  596                  * VHOLD is a bit of a hack.  Even though we successfully
  597                  * added another shared ref, the cpu that got the first
  598                  * shared ref might not yet have held the vnode.
  599                  */
  600                 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
  601                     NC_SHLOCK_FLAG) {
  602                         KKASSERT((count & ~(NC_EXLOCK_REQ |
  603                                             NC_SHLOCK_REQ |
  604                                             NC_SHLOCK_FLAG)) > 0);
  605                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  606                                               count, count + 1)) {
  607                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
  608                                         cpu_pause();
  609                                 break;
  610                         }
  611                         continue;
  612                 }
  613                 return(EWOULDBLOCK);
  614         }
  615         return(0);
  616 }
 617
  618 /*
  619  * Helper function
       *
       * Release one count of the lock held on ncp, shared or exclusive.
       * The caller must hold the lock: this is asserted by requiring either
       * NC_SHLOCK_FLAG to be set or nc_locktd to be the current thread.
       *
       * When the last count is released the lock word is reset (keeping
       * NC_SHLOCK_REQ only if an exclusive waiter is being woken), waiters
       * are woken, and the hold taken on nc_vp at lock time is dropped.
       * Exclusive waiters (woken on &nc_locktd) take priority over shared
       * waiters (woken on ncp).
  620  *
  621  * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
  622  *
  623  *       nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
  624  */
  625 static
  626 void
  627 _cache_unlock(struct namecache *ncp)
  628 {
  629         thread_t td __debugvar = curthread;
  630         u_int count;
  631         u_int ncount;
  632         struct vnode *dropvp;
  633
  634         KKASSERT(ncp->nc_refs >= 0);
  635         KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
  636         KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);
  637
  638         count = ncp->nc_lockstatus;
  639         cpu_ccfence();
  640
  641         /*
  642          * Clear nc_locktd prior to the atomic op (excl lock only)
               *
               * The test only matches the final exclusive release: a shared
               * lock has NC_SHLOCK_FLAG set and so never compares equal to 1.
  643          */
  644         if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
  645                 ncp->nc_locktd = NULL;
  646         dropvp = NULL;
  647
  648         for (;;) {
                      /* last count (shared or exclusive): fully release */
  649                 if ((count &
  650                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
  651                         dropvp = ncp->nc_vp;
  652                         if (count & NC_EXLOCK_REQ)
  653                                 ncount = count & NC_SHLOCK_REQ; /* cnt->0 */
  654                         else
  655                                 ncount = 0;
  656
  657                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  658                                               count, ncount)) {
  659                                 if (count & NC_EXLOCK_REQ)
  660                                         wakeup(&ncp->nc_locktd);
  661                                 else if (count & NC_SHLOCK_REQ)
  662                                         wakeup(ncp);
  663                                 break;
  664                         }
                              /* cmpset failed, do not drop the vp yet */
  665                         dropvp = NULL;
  666                 } else {
                              /* other counts remain: just decrement */
  667                         KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
  668                         KKASSERT((count & ~(NC_EXLOCK_REQ |
  669                                             NC_SHLOCK_REQ |
  670                                             NC_SHLOCK_FLAG)) > 1);
  671                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  672                                               count, count - 1)) {
  673                                 break;
  674                         }
  675                 }
                      /* cmpset failed: reload the lock word and retry */
  676                 count = ncp->nc_lockstatus;
  677                 cpu_ccfence();
  678         }
  679
  680         /*
  681          * Don't actually drop the vp until we successfully clean out
  682          * the lock, otherwise we may race another shared lock.
  683          */
  684         if (dropvp)
  685                 vdrop(dropvp);
  686 }
  687
      /*
       * Report how ncp is currently locked.
       *
       * Returns LK_EXCLUSIVE if nc_locktd is the current thread, otherwise
       * LK_SHARED if NC_SHLOCK_FLAG is set in nc_lockstatus, otherwise -1.
       *
       * The shared test only looks at the flag; it does not establish that
       * the calling thread is one of the shared holders.
       */
  688 static
  689 int
  690 _cache_lockstatus(struct namecache *ncp)
  691 {
  692         if (ncp->nc_locktd == curthread)
  693                 return(LK_EXCLUSIVE);
  694         if (ncp->nc_lockstatus & NC_SHLOCK_FLAG)
  695                 return(LK_SHARED);
  696         return(-1);
  697 }
 698
  699 /*
  700  * cache_hold() and cache_drop() prevent the premature deletion of a
  701  * namecache entry but do not prevent operations (such as zapping) on
  702  * that namecache entry.
  703  *
  704  * This routine may only be called from outside this source module if
  705  * nc_refs is already at least 1.
  706  *
  707  * This is a rare case where callers are allowed to hold a spinlock,
  708  * so we can't ourselves.
       *
       * _cache_hold() atomically adds one to ncp->nc_refs and returns ncp
       * so the call can be used in an expression.
  709  */
  710 static __inline
  711 struct namecache *
  712 _cache_hold(struct namecache *ncp)
  713 {
  714         atomic_add_int(&ncp->nc_refs, 1);
  715         return(ncp);
  716 }
 717
 718 /*
 719  * Drop a cache entry, taking care to deal with races.
 720  *
 721  * For potential 1->0 transitions we must hold the ncp lock to safely
 722  * test its flags.  An unresolved entry with no children must be zapped
 723  * to avoid leaks.
 724  *
 725  * The call to cache_zap() itself will handle all remaining races and
 726  * will decrement the ncp's refs regardless.  If we are resolved or
 727  * have children nc_refs can safely be dropped to 0 without having to
 728  * zap the entry.
 729  *
 730  * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 731  *
 732  * NOTE: cache_zap() may return a non-NULL referenced parent which must
 733  *       be dropped in a loop.
 734  */
 735 static __inline
 736 void
 737 _cache_drop(struct namecache *ncp)
 738 {
 739         int refs;
 740
 741         while (ncp) {
 742                 KKASSERT(ncp->nc_refs > 0);
 743                 refs = ncp->nc_refs;
 744
 745                 if (refs == 1) {
 746                         if (_cache_lock_nonblock(ncp) == 0) {
 747                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
 748                                 if ((ncp->nc_flag & NCF_UNRESOLVED) &&
 749                                     TAILQ_EMPTY(&ncp->nc_list)) {
 750                                         ncp = cache_zap(ncp, 1);
 751                                         continue;
 752                                 }
 753                                 if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
 754                                         _cache_unlock(ncp);
 755                                         break;
 756                                 }
 757                                 _cache_unlock(ncp);
 758                         }
 759                 } else {
 760                         if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
 761                                 break;
 762                 }
 763                 cpu_pause();
 764         }
 765 }
 766
 767 /*
 768  * Link a new namecache entry to its parent and to the hash table.  Be
 769  * careful to avoid races if vhold() blocks in the future.
 770  *
 771  * Both ncp and par must be referenced and locked.
 772  *
 773  * NOTE: The hash table spinlock is held during this call, we can't do
 774  *       anything fancy.
 775  */
 776 static void
 777 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 778                    struct nchash_head *nchpp)
 779 {
 780         KKASSERT(ncp->nc_parent == NULL);
 781         ncp->nc_parent = par;
 782         ncp->nc_head = nchpp;
 783
 784         /*
 785          * Set inheritance flags.  Note that the parent flags may be
 786          * stale due to getattr potentially not having been run yet
 787          * (it gets run during nlookup()'s).
 788          */
 789         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 790         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
 791                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 792         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
 793                 ncp->nc_flag |= NCF_UF_PCACHE;
 794
 795         LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 796
 797         if (TAILQ_EMPTY(&par->nc_list)) {
 798                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 799                 /*
 800                  * Any vp associated with an ncp which has children must
 801                  * be held to prevent it from being recycled.
 802                  */
 803                 if (par->nc_vp)
 804                         vhold(par->nc_vp);
 805         } else {
 806                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 807         }
 808 }
 809
 810 /*
 811  * Remove the parent and hash associations from a namecache structure.
 812  * If this is the last child of the parent the cache_drop(par) will
 813  * attempt to recursively zap the parent.
 814  *
 815  * ncp must be locked.  This routine will acquire a temporary lock on
 816  * the parent as wlel as the appropriate hash chain.
 817  */
 818 static void
 819 _cache_unlink_parent(struct namecache *ncp)
 820 {
 821         struct namecache *par;
 822         struct vnode *dropvp;
 823
 824         if ((par = ncp->nc_parent) != NULL) {
 825                 KKASSERT(ncp->nc_parent == par);
 826                 _cache_hold(par);
 827                 _cache_lock(par);
 828                 spin_lock(&ncp->nc_head->spin);
 829                 LIST_REMOVE(ncp, nc_hash);
 830                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
 831                 dropvp = NULL;
 832                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
 833                         dropvp = par->nc_vp;
 834                 spin_unlock(&ncp->nc_head->spin);
 835                 ncp->nc_parent = NULL;
 836                 ncp->nc_head = NULL;
 837                 _cache_unlock(par);
 838                 _cache_drop(par);
 839
 840                 /*
 841                  * We can only safely vdrop with no spinlocks held.
 842                  */
 843                 if (dropvp)
 844                         vdrop(dropvp);
 845         }
 846 }
 847
 848 /*
 849  * Allocate a new namecache structure.  Most of the code does not require
 850  * zero-termination of the string but it makes vop_compat_ncreate() easier.
 851  */
 852 static struct namecache *
 853 cache_alloc(int nlen)
 854 {
 855         struct namecache *ncp;
 856
 857         ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
 858         if (nlen)
 859                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
 860         ncp->nc_nlen = nlen;
 861         ncp->nc_flag = NCF_UNRESOLVED;
 862         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
 863         ncp->nc_refs = 1;
 864
 865         TAILQ_INIT(&ncp->nc_list);
 866         _cache_lock(ncp);
 867         return(ncp);
 868 }
 869
 870 /*
 871  * Can only be called for the case where the ncp has never been
 872  * associated with anything (so no spinlocks are needed).
 873  */
 874 static void
 875 _cache_free(struct namecache *ncp)
 876 {
 877         KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1);
 878         if (ncp->nc_name)
 879                 kfree(ncp->nc_name, M_VFSCACHE);
 880         kfree(ncp, M_VFSCACHE);
 881 }
 882
 883 /*
 884  * [re]initialize a nchandle.
 885  */
 886 void
 887 cache_zero(struct nchandle *nch)
 888 {
 889         nch->ncp = NULL;
 890         nch->mount = NULL;
 891 }
 892
 893 /*
 894  * Ref and deref a namecache structure.
 895  *
 896  * The caller must specify a stable ncp pointer, typically meaning the
 897  * ncp is already referenced but this can also occur indirectly through
 898  * e.g. holding a lock on a direct child.
 899  *
 900  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 901  *          use read spinlocks here.
 902  *
 903  * MPSAFE if nch is
 904  */
 905 struct nchandle *
 906 cache_hold(struct nchandle *nch)
 907 {
 908         _cache_hold(nch->ncp);
 909         atomic_add_int(&nch->mount->mnt_refs, 1);
 910         return(nch);
 911 }
 912
 913 /*
 914  * Create a copy of a namecache handle for an already-referenced
 915  * entry.
 916  *
 917  * MPSAFE if nch is
 918  */
 919 void
 920 cache_copy(struct nchandle *nch, struct nchandle *target)
 921 {
 922         *target = *nch;
 923         if (target->ncp)
 924                 _cache_hold(target->ncp);
 925         atomic_add_int(&nch->mount->mnt_refs, 1);
 926 }
 927
 928 /*
 929  * MPSAFE if nch is
 930  */
 931 void
 932 cache_changemount(struct nchandle *nch, struct mount *mp)
 933 {
 934         atomic_add_int(&nch->mount->mnt_refs, -1);
 935         nch->mount = mp;
 936         atomic_add_int(&nch->mount->mnt_refs, 1);
 937 }
 938
 939 void
 940 cache_drop(struct nchandle *nch)
 941 {
 942         atomic_add_int(&nch->mount->mnt_refs, -1);
 943         _cache_drop(nch->ncp);
 944         nch->ncp = NULL;
 945         nch->mount = NULL;
 946 }
 947
 948 int
 949 cache_lockstatus(struct nchandle *nch)
 950 {
 951         return(_cache_lockstatus(nch->ncp));
 952 }
 953
 954 void
 955 cache_lock(struct nchandle *nch)
 956 {
 957         _cache_lock(nch->ncp);
 958 }
 959
 960 void
 961 cache_lock_maybe_shared(struct nchandle *nch, int excl)
 962 {
 963         struct namecache *ncp = nch->ncp;
 964
 965         if (ncp_shared_lock_disable || excl ||
 966             (ncp->nc_flag & NCF_UNRESOLVED)) {
 967                 _cache_lock(ncp);
 968         } else {
 969                 _cache_lock_shared(ncp);
 970                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 971                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
 972                                 _cache_unlock(ncp);
 973                                 _cache_lock(ncp);
 974                         }
 975                 } else {
 976                         _cache_unlock(ncp);
 977                         _cache_lock(ncp);
 978                 }
 979         }
 980 }
 981
 982 /*
 983  * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 984  * is responsible for checking both for validity on return as they
 985  * may have become invalid.
 986  *
 987  * We have to deal with potential deadlocks here, just ping pong
 988  * the lock until we get it (we will always block somewhere when
 989  * looping so this is not cpu-intensive).
 990  *
 991  * which = 0    nch1 not locked, nch2 is locked
 992  * which = 1    nch1 is locked, nch2 is not locked
 993  */
 994 void
 995 cache_relock(struct nchandle *nch1, struct ucred *cred1,
 996              struct nchandle *nch2, struct ucred *cred2)
 997 {
 998         int which;
 999
1000         which = 0;
1001
1002         for (;;) {
1003                 if (which == 0) {
1004                         if (cache_lock_nonblock(nch1) == 0) {
1005                                 cache_resolve(nch1, cred1);
1006                                 break;
1007                         }
1008                         cache_unlock(nch2);
1009                         cache_lock(nch1);
1010                         cache_resolve(nch1, cred1);
1011                         which = 1;
1012                 } else {
1013                         if (cache_lock_nonblock(nch2) == 0) {
1014                                 cache_resolve(nch2, cred2);
1015                                 break;
1016                         }
1017                         cache_unlock(nch1);
1018                         cache_lock(nch2);
1019                         cache_resolve(nch2, cred2);
1020                         which = 0;
1021                 }
1022         }
1023 }
1024
1025 int
1026 cache_lock_nonblock(struct nchandle *nch)
1027 {
1028         return(_cache_lock_nonblock(nch->ncp));
1029 }
1030
1031 void
1032 cache_unlock(struct nchandle *nch)
1033 {
1034         _cache_unlock(nch->ncp);
1035 }
1036
1037 /*
1038  * ref-and-lock, unlock-and-deref functions.
1039  *
1040  * This function is primarily used by nlookup.  Even though cache_lock
1041  * holds the vnode, it is possible that the vnode may have already
1042  * initiated a recyclement.
1043  *
1044  * We want cache_get() to return a definitively usable vnode or a
1045  * definitively unresolved ncp.
1046  */
1047 static
1048 struct namecache *
1049 _cache_get(struct namecache *ncp)
1050 {
1051         _cache_hold(ncp);
1052         _cache_lock(ncp);
1053         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1054                 _cache_setunresolved(ncp);
1055         return(ncp);
1056 }
1057
1058 /*
1059  * Attempt to obtain a shared lock on the ncp.  A shared lock will only
1060  * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1061  * valid.  Otherwise an exclusive lock will be acquired instead.
1062  */
1063 static
1064 struct namecache *
1065 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1066 {
1067         if (ncp_shared_lock_disable || excl ||
1068             (ncp->nc_flag & NCF_UNRESOLVED)) {
1069                 return(_cache_get(ncp));
1070         }
1071         _cache_hold(ncp);
1072         _cache_lock_shared(ncp);
1073         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1074                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1075                         _cache_unlock(ncp);
1076                         ncp = _cache_get(ncp);
1077                         _cache_drop(ncp);
1078                 }
1079         } else {
1080                 _cache_unlock(ncp);
1081                 ncp = _cache_get(ncp);
1082                 _cache_drop(ncp);
1083         }
1084         return(ncp);
1085 }
1086
1087 /*
1088  * This is a special form of _cache_lock() which only succeeds if
1089  * it can get a pristine, non-recursive lock.  The caller must have
1090  * already ref'd the ncp.
1091  *
1092  * On success the ncp will be locked, on failure it will not.  The
1093  * ref count does not change either way.
1094  *
1095  * We want _cache_lock_special() (on success) to return a definitively
1096  * usable vnode or a definitively unresolved ncp.
1097  */
1098 static int
1099 _cache_lock_special(struct namecache *ncp)
1100 {
1101         if (_cache_lock_nonblock(ncp) == 0) {
1102                 if ((ncp->nc_lockstatus &
1103                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) {
1104                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1105                                 _cache_setunresolved(ncp);
1106                         return(0);
1107                 }
1108                 _cache_unlock(ncp);
1109         }
1110         return(EWOULDBLOCK);
1111 }
1112
1113 /*
1114  * This function tries to get a shared lock but will back-off to an exclusive
1115  * lock if:
1116  *
1117  * (1) Some other thread is trying to obtain an exclusive lock
1118  *     (to prevent the exclusive requester from getting livelocked out
1119  *     by many shared locks).
1120  *
1121  * (2) The current thread already owns an exclusive lock (to avoid
1122  *     deadlocking).
1123  *
1124  * WARNING! On machines with lots of cores we really want to try hard to
1125  *          get a shared lock or concurrent path lookups can chain-react
1126  *          into a very high-latency exclusive lock.
1127  */
1128 static int
1129 _cache_lock_shared_special(struct namecache *ncp)
1130 {
1131         /*
1132          * Only honor a successful shared lock (returning 0) if there is
1133          * no exclusive request pending and the vnode, if present, is not
1134          * in a reclaimed state.
1135          */
1136         if (_cache_lock_shared_nonblock(ncp) == 0) {
1137                 if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) {
1138                         if (ncp->nc_vp == NULL ||
1139                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
1140                                 return(0);
1141                         }
1142                 }
1143                 _cache_unlock(ncp);
1144                 return(EWOULDBLOCK);
1145         }
1146
1147         /*
1148          * Non-blocking shared lock failed.  If we already own the exclusive
1149          * lock just acquire another exclusive lock (instead of deadlocking).
1150          * Otherwise acquire a shared lock.
1151          */
1152         if (ncp->nc_locktd == curthread) {
1153                 _cache_lock(ncp);
1154                 return(0);
1155         }
1156         _cache_lock_shared(ncp);
1157         return(0);
1158 }
1159
1160
1161 /*
1162  * NOTE: The same nchandle can be passed for both arguments.
1163  */
1164 void
1165 cache_get(struct nchandle *nch, struct nchandle *target)
1166 {
1167         KKASSERT(nch->ncp->nc_refs > 0);
1168         target->mount = nch->mount;
1169         target->ncp = _cache_get(nch->ncp);
1170         atomic_add_int(&target->mount->mnt_refs, 1);
1171 }
1172
1173 void
1174 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1175 {
1176         KKASSERT(nch->ncp->nc_refs > 0);
1177         target->mount = nch->mount;
1178         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1179         atomic_add_int(&target->mount->mnt_refs, 1);
1180 }
1181
/*
 * Unlock-and-deref: the counterpart of _cache_get().  The ncp is
 * unlocked first and then one reference is dropped.
 */
static __inline void
_cache_put(struct namecache *ncp)
{
	_cache_unlock(ncp);
	_cache_drop(ncp);
}
1192
1193 /*
1194  *
1195  */
1196 void
1197 cache_put(struct nchandle *nch)
1198 {
1199         atomic_add_int(&nch->mount->mnt_refs, -1);
1200         _cache_put(nch->ncp);
1201         nch->ncp = NULL;
1202         nch->mount = NULL;
1203 }
1204
1205 /*
1206  * Resolve an unresolved ncp by associating a vnode with it.  If the
1207  * vnode is NULL, a negative cache entry is created.
1208  *
1209  * The ncp should be locked on entry and will remain locked on return.
1210  */
1211 static
1212 void
1213 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
1214 {
1215         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
1216         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1217
1218         if (vp != NULL) {
1219                 /*
1220                  * Any vp associated with an ncp which has children must
1221                  * be held.  Any vp associated with a locked ncp must be held.
1222                  */
1223                 if (!TAILQ_EMPTY(&ncp->nc_list))
1224                         vhold(vp);
1225                 spin_lock(&vp->v_spin);
1226                 ncp->nc_vp = vp;
1227                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1228                 spin_unlock(&vp->v_spin);
1229                 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1230                         vhold(vp);
1231
1232                 /*
1233                  * Set auxiliary flags
1234                  */
1235                 switch(vp->v_type) {
1236                 case VDIR:
1237                         ncp->nc_flag |= NCF_ISDIR;
1238                         break;
1239                 case VLNK:
1240                         ncp->nc_flag |= NCF_ISSYMLINK;
1241                         /* XXX cache the contents of the symlink */
1242                         break;
1243                 default:
1244                         break;
1245                 }
1246                 atomic_add_int(&numcache, 1);
1247                 ncp->nc_error = 0;
1248                 /* XXX: this is a hack to work-around the lack of a real pfs vfs
1249                  * implementation*/
1250                 if (mp != NULL)
1251                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1252                                 vp->v_pfsmp = mp;
1253         } else {
1254                 /*
1255                  * When creating a negative cache hit we set the
1256                  * namecache_gen.  A later resolve will clean out the
1257                  * negative cache hit if the mount point's namecache_gen
1258                  * has changed.  Used by devfs, could also be used by
1259                  * other remote FSs.
1260                  */
1261                 ncp->nc_vp = NULL;
1262                 spin_lock(&ncspin);
1263                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
1264                 ++numneg;
1265                 spin_unlock(&ncspin);
1266                 ncp->nc_error = ENOENT;
1267                 if (mp)
1268                         VFS_NCPGEN_SET(mp, ncp);
1269         }
1270         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
1271 }
1272
1273 /*
1274  *
1275  */
1276 void
1277 cache_setvp(struct nchandle *nch, struct vnode *vp)
1278 {
1279         _cache_setvp(nch->mount, nch->ncp, vp);
1280 }
1281
1282 /*
1283  *
1284  */
1285 void
1286 cache_settimeout(struct nchandle *nch, int nticks)
1287 {
1288         struct namecache *ncp = nch->ncp;
1289
1290         if ((ncp->nc_timeout = ticks + nticks) == 0)
1291                 ncp->nc_timeout = 1;
1292 }
1293
1294 /*
1295  * Disassociate the vnode or negative-cache association and mark a
1296  * namecache entry as unresolved again.  Note that the ncp is still
1297  * left in the hash table and still linked to its parent.
1298  *
1299  * The ncp should be locked and refd on entry and will remain locked and refd
1300  * on return.
1301  *
1302  * This routine is normally never called on a directory containing children.
1303  * However, NFS often does just that in its rename() code as a cop-out to
1304  * avoid complex namespace operations.  This disconnects a directory vnode
1305  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1306  * sync.
1307  *
1308  */
1309 static
1310 void
1311 _cache_setunresolved(struct namecache *ncp)
1312 {
1313         struct vnode *vp;
1314
1315         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1316                 ncp->nc_flag |= NCF_UNRESOLVED;
1317                 ncp->nc_timeout = 0;
1318                 ncp->nc_error = ENOTCONN;
1319                 if ((vp = ncp->nc_vp) != NULL) {
1320                         atomic_add_int(&numcache, -1);
1321                         spin_lock(&vp->v_spin);
1322                         ncp->nc_vp = NULL;
1323                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1324                         spin_unlock(&vp->v_spin);
1325
1326                         /*
1327                          * Any vp associated with an ncp with children is
1328                          * held by that ncp.  Any vp associated with a locked
1329                          * ncp is held by that ncp.  These conditions must be
1330                          * undone when the vp is cleared out from the ncp.
1331                          */
1332                         if (!TAILQ_EMPTY(&ncp->nc_list))
1333                                 vdrop(vp);
1334                         if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1335                                 vdrop(vp);
1336                 } else {
1337                         spin_lock(&ncspin);
1338                         TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
1339                         --numneg;
1340                         spin_unlock(&ncspin);
1341                 }
1342                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1343         }
1344 }
1345
1346 /*
1347  * The cache_nresolve() code calls this function to automatically
1348  * set a resolved cache element to unresolved if it has timed out
1349  * or if it is a negative cache hit and the mount point namecache_gen
1350  * has changed.
1351  */
1352 static __inline int
1353 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1354 {
1355         /*
1356          * Try to zap entries that have timed out.  We have
1357          * to be careful here because locked leafs may depend
1358          * on the vnode remaining intact in a parent, so only
1359          * do this under very specific conditions.
1360          */
1361         if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1362             TAILQ_EMPTY(&ncp->nc_list)) {
1363                 return 1;
1364         }
1365
1366         /*
1367          * If a resolved negative cache hit is invalid due to
1368          * the mount's namecache generation being bumped, zap it.
1369          */
1370         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1371                 return 1;
1372         }
1373
1374         /*
1375          * Otherwise we are good
1376          */
1377         return 0;
1378 }
1379
1380 static __inline void
1381 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1382 {
1383         /*
1384          * Already in an unresolved state, nothing to do.
1385          */
1386         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1387                 if (_cache_auto_unresolve_test(mp, ncp))
1388                         _cache_setunresolved(ncp);
1389         }
1390 }
1391
1392 /*
1393  *
1394  */
1395 void
1396 cache_setunresolved(struct nchandle *nch)
1397 {
1398         _cache_setunresolved(nch->ncp);
1399 }
1400
1401 /*
1402  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1403  * looking for matches.  This flag tells the lookup code when it must
1404  * check for a mount linkage and also prevents the directories in question
1405  * from being deleted or renamed.
1406  */
1407 static
1408 int
1409 cache_clrmountpt_callback(struct mount *mp, void *data)
1410 {
1411         struct nchandle *nch = data;
1412
1413         if (mp->mnt_ncmounton.ncp == nch->ncp)
1414                 return(1);
1415         if (mp->mnt_ncmountpt.ncp == nch->ncp)
1416                 return(1);
1417         return(0);
1418 }
1419
1420 /*
1421  *
1422  */
1423 void
1424 cache_clrmountpt(struct nchandle *nch)
1425 {
1426         int count;
1427
1428         count = mountlist_scan(cache_clrmountpt_callback, nch,
1429                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
1430         if (count == 0)
1431                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1432 }
1433
/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state and the flags below
 * select what else is done.
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp.  Even if
 *			  CINV_CHILDREN is specified the children are not
 *			  flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 */

/*
 * Bookkeeping shared by _cache_inval() and _cache_inval_internal() for
 * the deep recursion handling described above.
 */
struct cinvtrack {
	/*
	 * Node at which a too-deep recursion was aborted, or NULL.
	 * _cache_inval_internal() takes a reference when it sets this;
	 * _cache_inval() clears it and resumes the invalidation there.
	 */
	struct namecache *resume_ncp;
	/*
	 * Current nesting depth of _cache_inval_internal(), compared
	 * against MAX_RECURSION_DEPTH.
	 */
	int depth;
};

/* Recursive worker, defined after _cache_inval() which calls it. */
static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1493
1494 static
1495 int
1496 _cache_inval(struct namecache *ncp, int flags)
1497 {
1498         struct cinvtrack track;
1499         struct namecache *ncp2;
1500         int r;
1501
1502         track.depth = 0;
1503         track.resume_ncp = NULL;
1504
1505         for (;;) {
1506                 r = _cache_inval_internal(ncp, flags, &track);
1507                 if (track.resume_ncp == NULL)
1508                         break;
1509                 kprintf("Warning: deep namecache recursion at %s\n",
1510                         ncp->nc_name);
1511                 _cache_unlock(ncp);
1512                 while ((ncp2 = track.resume_ncp) != NULL) {
1513                         track.resume_ncp = NULL;
1514                         _cache_lock(ncp2);
1515                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1516                                              &track);
1517                         _cache_put(ncp2);
1518                 }
1519                 _cache_lock(ncp);
1520         }
1521         return(r);
1522 }
1523
1524 int
1525 cache_inval(struct nchandle *nch, int flags)
1526 {
1527         return(_cache_inval(nch->ncp, flags));
1528 }
1529
1530 /*
1531  * Helper for _cache_inval().  The passed ncp is refd and locked and
1532  * remains that way on return, but may be unlocked/relocked multiple
1533  * times by the routine.
      *
      * CINV_DESTROY in flags marks ncp NCF_DESTROYED and bumps its
      * nc_generation.  CINV_CHILDREN walks ncp's nc_list and recurses into
      * each child with CINV_DESTROY masked off.
      *
      * track carries the current recursion depth.  Once the depth exceeds
      * MAX_RECURSION_DEPTH the ncp reached is recorded in track->resume_ncp
      * with an extra hold, and every level stops visiting further children.
      *
      * The return value is a count.  It is non-zero when the walk was cut
      * short or when a visited ncp is no longer unresolved by the time the
      * routine is done with it.
1534  */
1535 static int
1536 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1537 {
1538         struct namecache *kid;
1539         struct namecache *nextkid;
1540         int rcnt = 0;
1541
1542         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1543
1544         _cache_setunresolved(ncp);
1545         if (flags & CINV_DESTROY) {
1546                 ncp->nc_flag |= NCF_DESTROYED;
1547                 ++ncp->nc_generation;
1548         }
1549         if ((flags & CINV_CHILDREN) &&
1550             (kid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1551         ) {
                     /* take a hold on the first child before ncp is unlocked */
1552                 _cache_hold(kid);
1553                 if (++track->depth > MAX_RECURSION_DEPTH) {
                             /*
                              * Too deep.  Record ncp as the resume point with
                              * its own hold and make the result non-zero.
                              */
1554                         track->resume_ncp = ncp;
1555                         _cache_hold(ncp);
1556                         ++rcnt;
1557                 }
1558                 _cache_unlock(ncp);
1559                 while (kid) {
                             /*
                              * A resume point was recorded (at this level or a
                              * deeper one); release the held child and stop.
                              */
1560                         if (track->resume_ncp) {
1561                                 _cache_drop(kid);
1562                                 break;
1563                         }
                             /* hold the next sibling before working on kid */
1564                         if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1565                                 _cache_hold(nextkid);
                             /*
                              * Only recurse into children that are resolved
                              * or that have children of their own.
                              */
1566                         if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1567                             TAILQ_FIRST(&kid->nc_list)
1568                         ) {
1569                                 _cache_lock(kid);
1570                                 rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
1571                                 _cache_unlock(kid);
1572                         }
1573                         _cache_drop(kid);
1574                         kid = nextkid;
1575                 }
1576                 --track->depth;
1577                 _cache_lock(ncp);
1578         }
1579
1580         /*
1581          * Someone could have gotten in there while ncp was unlocked,
1582          * retry if so.
              *
              * Nothing is retried here; the condition is only counted in
              * the return value.
1583          */
1584         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1585                 ++rcnt;
1586         return (rcnt);
1587 }
1588
1589 /*
1590  * Invalidate a vnode's namecache associations.  To avoid races against
1591  * the resolver we do not invalidate a node which we previously invalidated
1592  * but which was then re-resolved while we were in the invalidation loop.
1593  *
      * flags is handed unchanged to _cache_inval() for every entry found on
      * vp->v_namecache.
      *
1594  * Returns non-zero if any namecache entries remain after the invalidation
1595  * loop completed.
1596  *
1597  * NOTE: Unlike the namecache topology which guarantees that ncp's will not
1598  *       be ripped out of the topology while held, the vnode's v_namecache
1599  *       list has no such restriction.  NCP's can be ripped out of the list
1600  *       at virtually any time if not locked, even if held.
1601  *
1602  *       In addition, the v_namecache list itself must be locked via
1603  *       the vnode's spinlock.
1604  */
1605 int
1606 cache_inval_vp(struct vnode *vp, int flags)
1607 {
1608         struct namecache *ncp;
1609         struct namecache *next;
1610
1611 restart:
1612         spin_lock(&vp->v_spin);
1613         ncp = TAILQ_FIRST(&vp->v_namecache);
1614         if (ncp)
1615                 _cache_hold(ncp);
1616         while (ncp) {
1617                 /* loop entered with ncp held and vp spin-locked */
1618                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1619                         _cache_hold(next);
1620                 spin_unlock(&vp->v_spin);
1621                 _cache_lock(ncp);
1622                 if (ncp->nc_vp != vp) {
                             /*
                              * ncp no longer belongs to vp now that we hold
                              * its lock.  Release ncp and the held successor
                              * and rescan the list from its head.
                              */
1623                         kprintf("Warning: cache_inval_vp: race-A detected on "
1624                                 "%s\n", ncp->nc_name);
1625                         _cache_put(ncp);
1626                         if (next)
1627                                 _cache_drop(next);
1628                         goto restart;
1629                 }
1630                 _cache_inval(ncp, flags);
1631                 _cache_put(ncp);                /* also releases reference */
1632                 ncp = next;
1633                 spin_lock(&vp->v_spin);
1634                 if (ncp && ncp->nc_vp != vp) {
                             /*
                              * The held successor was moved off vp while we
                              * were busy with the previous entry; rescan.
                              */
1635                         spin_unlock(&vp->v_spin);
1636                         kprintf("Warning: cache_inval_vp: race-B detected on "
1637                                 "%s\n", ncp->nc_name);
1638                         _cache_drop(ncp);
1639                         goto restart;
1640                 }
1641         }
1642         spin_unlock(&vp->v_spin);
             /* the list head is sampled after v_spin has been released */
1643         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1644 }
1645
1646 /*
1647  * This routine is used instead of the normal cache_inval_vp() when we
1648  * are trying to recycle otherwise good vnodes.
1649  *
      * Each entry is locked with _cache_lock_nonblock() and invalidated with
      * a flags value of 0.  When _cache_lock_nonblock() returns non-zero
      * (presumably: the lock could not be obtained without blocking), or an
      * entry turns out to no longer belong to vp, the scan stops instead of
      * restarting.
      *
1650  * Return 0 on success, non-zero if not all namecache records could be
1651  * disassociated from the vnode (for various reasons).
1652  */
1653 int
1654 cache_inval_vp_nonblock(struct vnode *vp)
1655 {
1656         struct namecache *ncp;
1657         struct namecache *next;
1658
1659         spin_lock(&vp->v_spin);
1660         ncp = TAILQ_FIRST(&vp->v_namecache);
1661         if (ncp)
1662                 _cache_hold(ncp);
1663         while (ncp) {
1664                 /* loop entered with ncp held */
1665                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1666                         _cache_hold(next);
1667                 spin_unlock(&vp->v_spin);
1668                 if (_cache_lock_nonblock(ncp)) {
                             /* not locked: only the holds need releasing */
1669                         _cache_drop(ncp);
1670                         if (next)
1671                                 _cache_drop(next);
1672                         goto done;
1673                 }
                     /*
                      * NOTE(review): the two warnings below carry the
                      * "cache_inval_vp" label, the same text the blocking
                      * variant prints.
                      */
1674                 if (ncp->nc_vp != vp) {
1675                         kprintf("Warning: cache_inval_vp: race-A detected on "
1676                                 "%s\n", ncp->nc_name);
1677                         _cache_put(ncp);
1678                         if (next)
1679                                 _cache_drop(next);
1680                         goto done;
1681                 }
1682                 _cache_inval(ncp, 0);
1683                 _cache_put(ncp);                /* also releases reference */
1684                 ncp = next;
1685                 spin_lock(&vp->v_spin);
1686                 if (ncp && ncp->nc_vp != vp) {
1687                         spin_unlock(&vp->v_spin);
1688                         kprintf("Warning: cache_inval_vp: race-B detected on "
1689                                 "%s\n", ncp->nc_name);
1690                         _cache_drop(ncp);
1691                         goto done;
1692                 }
1693         }
1694         spin_unlock(&vp->v_spin);
1695 done:
             /* reached with v_spin released on every path */
1696         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1697 }
1698
1699 /*
1700  * The source ncp has been renamed to the target ncp.  Both fncp and tncp
1701  * must be locked.  The target ncp is destroyed (as a normal rename-over
1702  * would destroy the target file or directory).
1703  *
1704  * Because there may be references to the source ncp we cannot copy its
1705  * contents to the target.  Instead the source ncp is relinked as the target
1706  * and the target ncp is removed from the namecache topology.
      *
      * Sequence: bump both generation counters, copy the target's name,
      * unlink fncp from its current parent and give it the copied name,
      * relink fncp under the target's parent (which is held and locked for
      * the duration of the relink), then hand tncp to _cache_unlink().
1707  */
1708 void
1709 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
1710 {
1711         struct namecache *fncp = fnch->ncp;
1712         struct namecache *tncp = tnch->ncp;
1713         struct namecache *tncp_par;
1714         struct nchash_head *nchpp;
1715         u_int32_t hash;
1716         char *oname;
1717         char *nname;
1718
1719         ++fncp->nc_generation;
1720         ++tncp->nc_generation;
             /* private NUL-terminated copy of the target name (NULL if empty) */
1721         if (tncp->nc_nlen) {
1722                 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
1723                 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
1724                 nname[tncp->nc_nlen] = 0;
1725         } else {
1726                 nname = NULL;
1727         }
1728
1729         /*
1730          * Rename fncp (unlink)
              *
              * fncp takes over the copied name; its previous name buffer,
              * if it had one, is freed.
1731          */
1732         _cache_unlink_parent(fncp);
1733         oname = fncp->nc_name;
1734         fncp->nc_name = nname;
1735         fncp->nc_nlen = tncp->nc_nlen;
1736         if (oname)
1737                 kfree(oname, M_VFSCACHE);
1738
             /*
              * NOTE(review): tncp->nc_parent is used without a NULL check;
              * assumes the target always has a parent -- confirm.
              */
1739         tncp_par = tncp->nc_parent;
1740         _cache_hold(tncp_par);
1741         _cache_lock(tncp_par);
1742
1743         /*
1744          * Rename fncp (relink)
              *
              * The hash covers the new name followed by the value of the
              * parent pointer itself (not the parent's contents).
1745          */
1746         hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
1747         hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
1748         nchpp = NCHHASH(hash);
1749
1750         spin_lock(&nchpp->spin);
1751         _cache_link_parent(fncp, tncp_par, nchpp);
1752         spin_unlock(&nchpp->spin);
1753
1754         _cache_put(tncp_par);
1755
1756         /*
1757          * Get rid of the overwritten tncp (unlink)
1758          */
1759         _cache_unlink(tncp);
1760 }
1761
1762 /*
1763  * Perform actions consistent with unlinking a file.  The passed-in ncp
1764  * must be locked.
1765  *
1766  * The ncp is marked DESTROYED so it no longer shows up in searches,
1767  * and will be physically deleted when the vnode goes away.
1768  *
1769  * If the related vnode has no refs then we cycle it through vget()/vput()
1770  * to trigger a deactivation (provided we do not lose a ref race),
1771  * allowing the VFS to trivially detect and recycle the deleted vnode
1772  * via VOP_INACTIVE().
1773  *
1774  * NOTE: cache_rename() will automatically call _cache_unlink() on the
1775  *       target ncp.
1776  */
1777 void
1778 cache_unlink(struct nchandle *nch)
1779 {
             /* nchandle front end; the work is done on the embedded ncp */
1780         _cache_unlink(nch->ncp);
1781 }
1782
     /*
      * Worker for cache_unlink(), operating directly on a namecache entry.
      * Flags the entry NCF_DESTROYED, bumps its generation, and if the entry
      * is resolved to a vnode sets VREF_FINALIZE on that vnode.
      */
1783 static void
1784 _cache_unlink(struct namecache *ncp)
1785 {
1786         struct vnode *vp;
1787
1788         /*
1789          * Causes lookups to fail and allows another ncp with the same
1790          * name to be created under ncp->nc_parent.
1791          */
1792         ncp->nc_flag |= NCF_DESTROYED;
1793         ++ncp->nc_generation;
1794
1795         /*
1796          * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
1797          * force action on the 1->0 transition.
              *
              * When the vnode currently has no refs, a ref is cycled
              * through vget()/vput() so that a 1->0 transition takes place.
              * A vget() failure is ignored.
1798          */
1799         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1800             (vp = ncp->nc_vp) != NULL) {
1801                 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
1802                 if (VREFCNT(vp) <= 0) {
1803                         if (vget(vp, LK_SHARED) == 0)
1804                                 vput(vp);
1805                 }
1806         }
1807 }
1808
1809 /*
1810  * Tell the caller whether nch may refer to a file that is open and/or
1811  * mmap()'d.  Any ref on the vnode counts: 1 is returned when nch is
1812  * resolved to a vnode whose VREFCNT() is non-zero, otherwise 0.  Used to
1813  * interlock hammer2 reclaims (VREF_FINALIZE should already be set).
1814  */
1815 int
1816 cache_isopen(struct nchandle *nch)
1817 {
1818         struct namecache *entry = nch->ncp;
1819         struct vnode *node;
1820
1821         if (entry->nc_flag & NCF_UNRESOLVED)
1822                 return 0;
1823         node = entry->nc_vp;
1824         if (node == NULL)
1825                 return 0;
1826         return (VREFCNT(node) ? 1 : 0);
1827 }
1828
1829
1830 /*
1831  * vget the vnode associated with the namecache entry.  Resolve the namecache
1832  * entry if necessary.  The passed ncp must be referenced and locked.  If
1833  * the ncp is resolved it might be locked shared.
1834  *
1835  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
1836  * (depending on the passed lk_type) will be returned in *vpp with an error
1837  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
1838  * most typical error is ENOENT, meaning that the ncp represents a negative
1839  * cache hit and there is no vnode to retrieve, but other errors can occur
1840  * too.
1841  *
      * cred is only used when the entry is unresolved and cache_resolve()
      * has to be called.
      *
1842  * The vget() can race a reclaim.  If this occurs we re-resolve the
1843  * namecache entry.
1844  *
1845  * There are numerous places in the kernel where vget() is called on a
1846  * vnode while one or more of its namecache entries is locked.  Releasing
1847  * a vnode never deadlocks against locked namecache entries (the vnode
1848  * will not get recycled while referenced ncp's exist).  This means we
1849  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
1850  * lock when acquiring the vp lock or we might cause a deadlock.
1851  *
1852  * NOTE: The passed-in ncp must be locked exclusively if it is initially
1853  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
1854  *       relocked exclusively before being re-resolved.
1855  */
1856 int
1857 cache_vget(struct nchandle *nch, struct ucred *cred,
1858            int lk_type, struct vnode **vpp)
1859 {
1860         struct namecache *ncp;
1861         struct vnode *vp;
1862         int error;
1863
1864         ncp = nch->ncp;
1865 again:
             /* every pass (including a retry after a reclaim race) starts clean */
1866         vp = NULL;
1867         if (ncp->nc_flag & NCF_UNRESOLVED)
1868                 error = cache_resolve(nch, cred);
1869         else
1870                 error = 0;
1871
1872         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1873                 error = vget(vp, lk_type);
1874                 if (error) {
1875                         /*
1876                          * VRECLAIM race
1877                          *
1878                          * The ncp may have been locked shared, we must relock
1879                          * it exclusively before we can set it to unresolved.
1880                          */
1881                         if (error == ENOENT) {
1882                                 kprintf("Warning: vnode reclaim race detected "
1883                                         "in cache_vget on %p (%s)\n",
1884                                         vp, ncp->nc_name);
1885                                 _cache_unlock(ncp);
1886                                 _cache_lock(ncp);
1887                                 _cache_setunresolved(ncp);
1888                                 goto again;
1889                         }
1890
1891                         /*
1892                          * Not a reclaim race, some other error.
                              * The error is returned with *vpp set to NULL.
1893                          */
1894                         KKASSERT(ncp->nc_vp == vp);
1895                         vp = NULL;
1896                 } else {
1897                         KKASSERT(ncp->nc_vp == vp);
1898                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1899                 }
1900         }
             /* no error so far but no vnode either: report ENOENT */
1901         if (error == 0 && vp == NULL)
1902                 error = ENOENT;
1903         *vpp = vp;
1904         return(error);
1905 }
1906
1907 /*
1908  * Similar to cache_vget() but only acquires a ref on the vnode.
1909  *
      * The vnode is obtained with vget(LK_SHARED) and unlocked again before
      * returning, so on success *vpp is referenced but not locked.  On
      * failure *vpp is NULL and a non-zero error is returned; ENOENT is
      * returned when the ncp has no vnode.
      *
      * cred is only used when the entry is unresolved and cache_resolve()
      * has to be called.
      *
1910  * NOTE: The passed-in ncp must be locked exclusively if it is initially
1911  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
1912  *       relocked exclusively before being re-resolved.
1913  */
1914 int
1915 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
1916 {
1917         struct namecache *ncp;
1918         struct vnode *vp;
1919         int error;
1920
1921         ncp = nch->ncp;
1922 again:
             /* every pass (including a retry after a reclaim race) starts clean */
1923         vp = NULL;
1924         if (ncp->nc_flag & NCF_UNRESOLVED)
1925                 error = cache_resolve(nch, cred);
1926         else
1927                 error = 0;
1928
1929         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1930                 error = vget(vp, LK_SHARED);
1931                 if (error) {
1932                         /*
1933                          * VRECLAIM race
                              *
                              * Unlock and relock the ncp, mark it unresolved,
                              * and start over so it gets re-resolved.
1934                          */
1935                         if (error == ENOENT) {
1936                                 kprintf("Warning: vnode reclaim race detected "
1937                                         "in cache_vref on %p (%s)\n",
1938                                         vp, ncp->nc_name);
1939                                 _cache_unlock(ncp);
1940                                 _cache_lock(ncp);
1941                                 _cache_setunresolved(ncp);
1942                                 goto again;
1943                         }
1944
1945                         /*
1946                          * Not a reclaim race, some other error.
                              * The error is returned with *vpp set to NULL.
1947                          */
1948                         KKASSERT(ncp->nc_vp == vp);
1949                         vp = NULL;
1950                 } else {
1951                         KKASSERT(ncp->nc_vp == vp);
1952                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1953                         /* caller does not want a lock */
1954                         vn_unlock(vp);
1955                 }
1956         }
             /* no error so far but no vnode either: report ENOENT */
1957         if (error == 0 && vp == NULL)
1958                 error = ENOENT;
1959         *vpp = vp;
1960         return(error);
1961 }
1962
1963 /*
1964  * Return a referenced vnode representing the parent directory of
1965  * ncp.
1966  *
1967  * Because the caller has locked the ncp it should not be possible for
1968  * the parent ncp to go away.  However, the parent can unresolve its
1969  * dvp at any time so we must be able to acquire a lock on the parent
1970  * to safely access nc_vp.
1971  *
1972  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
1973  * so use vhold()/vdrop() while holding the lock to prevent dvp from
1974  * getting destroyed.
1975  *
1976  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
1977  *       lock on the ncp in question..
      *
      * NULL is returned when ncp has no parent, when the parent is
      * unresolved or has no vnode, or when the vget() on the parent's vnode
      * fails.
1978  */
1979 static struct vnode *
1980 cache_dvpref(struct namecache *ncp)
1981 {
1982         struct namecache *par;
1983         struct vnode *dvp;
1984
1985         dvp = NULL;
1986         if ((par = ncp->nc_parent) != NULL) {
1987                 _cache_hold(par);
1988                 _cache_lock(par);
                     /* nc_vp is only sampled while par is locked */
1989                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
1990                         if ((dvp = par->nc_vp) != NULL)
1991                                 vhold(dvp);
1992                 }
1993                 _cache_unlock(par);
1994                 if (dvp) {
                             /*
                              * par is unlocked at this point.  On success the
                              * ref from vget() is kept while the vnode lock
                              * and the vhold() are given back; on failure only
                              * the vhold() is given back.
                              */
1995                         if (vget(dvp, LK_SHARED) == 0) {
1996                                 vn_unlock(dvp);
1997                                 vdrop(dvp);
1998                                 /* return refd, unlocked dvp */
1999                         } else {
2000                                 vdrop(dvp);
2001                                 dvp = NULL;
2002                         }
2003                 }
2004                 _cache_drop(par);
2005         }
2006         return(dvp);
2007 }
2008
2009 /*
2010  * Convert a directory vnode to a namecache record without any other
2011  * knowledge of the topology.  This ONLY works with directory vnodes and
2012  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
2013  * returned ncp (if not NULL) will be held and unlocked.
2014  *
2015  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2016  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2017  * for dvp.  This will fail only if the directory has been deleted out from
2018  * under the caller.
2019  *
2020  * Callers must always check for a NULL return no matter the value of 'makeit'.
2021  *
2022  * To avoid overflowing the kernel stack each recursive call increments
2023  * the makeit variable.  Once makeit exceeds 20 the function stops
      * recursing and calls cache_fromdvp_try() instead.
      *
      * The result is delivered through *nch.  The function returns 0 when
      * nch->ncp ends up non-NULL and EINVAL otherwise.
2024  */
2025
2026 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2027                                   struct vnode *dvp, char *fakename);
2028 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2029                                   struct vnode **saved_dvp);
2030
2031 int
2032 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
2033               struct nchandle *nch)
2034 {
2035         struct vnode *saved_dvp;
2036         struct vnode *pvp;
2037         char *fakename;
2038         int error;
2039
2040         nch->ncp = NULL;
2041         nch->mount = dvp->v_mount;
2042         saved_dvp = NULL;
2043         fakename = NULL;
2044
2045         /*
2046          * Handle the makeit == 0 degenerate case
              *
              * Take whatever entry heads dvp's v_namecache list, if any.
2047          */
2048         if (makeit == 0) {
2049                 spin_lock_shared(&dvp->v_spin);
2050                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2051                 if (nch->ncp)
2052                         cache_hold(nch);
2053                 spin_unlock_shared(&dvp->v_spin);
2054         }
2055
2056         /*
2057          * Loop until resolution, inside code will break out on error.
2058          */
2059         while (makeit) {
2060                 /*
2061                  * Break out if we successfully acquire a working ncp.
2062                  */
2063                 spin_lock_shared(&dvp->v_spin);
2064                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2065                 if (nch->ncp) {
2066                         cache_hold(nch);
2067                         spin_unlock_shared(&dvp->v_spin);
2068                         break;
2069                 }
2070                 spin_unlock_shared(&dvp->v_spin);
2071
2072                 /*
2073                  * If dvp is the root of its filesystem it should already
2074                  * have a namecache pointer associated with it as a side
2075                  * effect of the mount, but it may have been disassociated.
                      *
                      * The mount point's ncp is only borrowed around the
                      * cache_resolve_mp() call.  On success the loop goes
                      * around again and re-reads dvp's v_namecache list.
2076                  */
2077                 if (dvp->v_flag & VROOT) {
2078                         nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
2079                         error = cache_resolve_mp(nch->mount);
2080                         _cache_put(nch->ncp);
2081                         if (ncvp_debug) {
2082                                 kprintf("cache_fromdvp: resolve root of mount %p error %d",
2083                                         dvp->v_mount, error);
2084                         }
2085                         if (error) {
2086                                 if (ncvp_debug)
2087                                         kprintf(" failed\n");
2088                                 nch->ncp = NULL;
2089                                 break;
2090                         }
2091                         if (ncvp_debug)
2092                                 kprintf(" succeeded\n");
2093                         continue;
2094                 }
2095
2096                 /*
2097                  * If we are recursed too deeply resort to an O(n^2)
2098                  * algorithm to resolve the namecache topology.  The
2099                  * resolved pvp is left referenced in saved_dvp to
2100                  * prevent the tree from being destroyed while we loop.
2101                  */
2102                 if (makeit > 20) {
2103                         error = cache_fromdvp_try(dvp, cred, &saved_dvp);
2104                         if (error) {
2105                                 kprintf("lookupdotdot(longpath) failed %d "
2106                                        "dvp %p\n", error, dvp);
2107                                 nch->ncp = NULL;
2108                                 break;
2109                         }
2110                         continue;
2111                 }
2112
2113                 /*
2114                  * Get the parent directory and resolve its ncp.
                      *
                      * A fakename left over from an earlier pass is freed
                      * before the lookup fills the pointer in again.
2115                  */
2116                 if (fakename) {
2117                         kfree(fakename, M_TEMP);
2118                         fakename = NULL;
2119                 }
2120                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2121                                           &fakename);
2122                 if (error) {
2123                         kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
2124                         break;
2125                 }
2126                 vn_unlock(pvp);
2127
2128                 /*
2129                  * Reuse makeit as a recursion depth counter.  On success
2130                  * nch will be fully referenced.
                      *
                      * The recursive call's return value is not used;
                      * nch->ncp is tested instead.
2131                  */
2132                 cache_fromdvp(pvp, cred, makeit + 1, nch);
2133                 vrele(pvp);
2134                 if (nch->ncp == NULL)
2135                         break;
2136
2137                 /*
2138                  * Do an inefficient scan of pvp (embodied by ncp) to look
2139                  * for dvp.  This will create a namecache record for dvp on
2140                  * success.  We loop up to recheck on success.
2141                  *
2142                  * ncp and dvp are both held but not locked.
                      *
                      * pvp has already been vrele()'d here; the kprintf()s
                      * below only print its pointer value.
2143                  */
2144                 error = cache_inefficient_scan(nch, cred, dvp, fakename);
2145                 if (error) {
2146                         kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
2147                                 pvp, nch->ncp->nc_name, dvp);
2148                         cache_drop(nch);
2149                         /* nch was NULLed out, reload mount */
2150                         nch->mount = dvp->v_mount;
2151                         break;
2152                 }
2153                 if (ncvp_debug) {
2154                         kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
2155                                 pvp, nch->ncp->nc_name);
2156                 }
2157                 cache_drop(nch);
2158                 /* nch was NULLed out, reload mount */
2159                 nch->mount = dvp->v_mount;
2160         }
2161
2162         /*
2163          * If nch->ncp is non-NULL it will have been held already.
2164          */
2165         if (fakename)
2166                 kfree(fakename, M_TEMP);
2167         if (saved_dvp)
2168                 vrele(saved_dvp);
2169         if (nch->ncp)
2170                 return (0);
2171         return (EINVAL);
2172 }
2173
2174 /*
2175  * Go up the chain of parent directories until we find something
2176  * we can resolve into the namecache.  This is very inefficient.
      *
      * The walk starts with its own ref on dvp and moves upward one
      * vop_nlookupdotdot() at a time, releasing the lower directory as it
      * goes.  It stops at the first parent that already has a namecache
      * entry, or at a VROOT parent whose mount resolves.
      *
      * On success cache_inefficient_scan() is run for the directory just
      * below that parent (not necessarily the dvp passed in), and that
      * directory is left referenced in *saved_dvp after any vnode already
      * stored there has been released.  The scan's result is the return
      * value.  A lookup or mount-resolve failure is returned as is.
2177  */
2178 static
2179 int
2180 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2181                   struct vnode **saved_dvp)
2182 {
2183         struct nchandle nch;
2184         struct vnode *pvp;
2185         int error;
             /* time_uptime value at which the warning below was last printed */
2186         static time_t last_fromdvp_report;
2187         char *fakename;
2188
2189         /*
2190          * Loop getting the parent directory vnode until we get something we
2191          * can resolve in the namecache.
2192          */
2193         vref(dvp);
2194         nch.mount = dvp->v_mount;
2195         nch.ncp = NULL;
2196         fakename = NULL;
2197
2198         for (;;) {
2199                 if (fakename) {
2200                         kfree(fakename, M_TEMP);
2201                         fakename = NULL;
2202                 }
2203                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2204                                           &fakename);
2205                 if (error) {
2206                         vrele(dvp);
2207                         break;
2208                 }
2209                 vn_unlock(pvp);
2210                 spin_lock_shared(&pvp->v_spin);
2211                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
                             /* parent is already in the namecache: stop here */
2212                         _cache_hold(nch.ncp);
2213                         spin_unlock_shared(&pvp->v_spin);
2214                         vrele(pvp);
2215                         break;
2216                 }
2217                 spin_unlock_shared(&pvp->v_spin);
2218                 if (pvp->v_flag & VROOT) {
                             /*
                              * NOTE(review): the ncp comes from pvp->v_mount
                              * while cache_resolve_mp() is given nch.mount,
                              * which was loaded from the dvp passed in.
                              * Presumably both are the same mount -- confirm.
                              */
2219                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2220                         error = cache_resolve_mp(nch.mount);
2221                         _cache_unlock(nch.ncp);
2222                         vrele(pvp);
2223                         if (error) {
2224                                 _cache_drop(nch.ncp);
2225                                 nch.ncp = NULL;
2226                                 vrele(dvp);
2227                         }
2228                         break;
2229                 }
                     /* nothing usable at this level: step up to the parent */
2230                 vrele(dvp);
2231                 dvp = pvp;
2232         }
2233         if (error == 0) {
                     /* printed at most once per distinct time_uptime value */
2234                 if (last_fromdvp_report != time_uptime) {
2235                         last_fromdvp_report = time_uptime;
2236                         kprintf("Warning: extremely inefficient path "
2237                                 "resolution on %s\n",
2238                                 nch.ncp->nc_name);
2239                 }
2240                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2241
2242                 /*
2243                  * Hopefully dvp now has a namecache record associated with
2244                  * it.  Leave it referenced to prevent the kernel from
2245                  * recycling the vnode.  Otherwise extremely long directory
2246                  * paths could result in endless recycling.
                      *
                      * This is done whether or not the scan succeeded.
2247                  */
2248                 if (*saved_dvp)
2249                     vrele(*saved_dvp);
2250                 *saved_dvp = dvp;
2251                 _cache_drop(nch.ncp);
2252         }
2253         if (fakename)
2254                 kfree(fakename, M_TEMP);
2255         return (error);
2256 }
2257
2258 /*
2259  * Do an inefficient scan of the directory represented by ncp looking for
2260  * the directory vnode dvp.  ncp must be held but not locked on entry and
2261  * will be held on return.  dvp must be refd but not locked on entry and
2262  * will remain refd on return.
2263  *
2264  * Why do this at all?  Well, due to its stateless nature the NFS server
2265  * converts file handles directly to vnodes without necessarily going through
2266  * the namecache ops that would otherwise create the namecache topology
2267  * leading to the vnode.  We could either (1) Change the namecache algorithms
2268  * to allow disconnected namecache records that are re-merged opportunistically,
2269  * or (2) Make the NFS server backtrack and scan to recover a connected
2270  * namecache topology in order to then be able to issue new API lookups.
2271  *
2272  * It turns out that (1) is a huge mess.  It takes a nice clean set of
2273  * namecache algorithms and introduces a lot of complication in every subsystem
2274  * that calls into the namecache to deal with the re-merge case, especially
2275  * since we are using the namecache to placehold negative lookups and the
2276  * vnode might not be immediately assigned. (2) is certainly far less
2277  * efficient than (1), but since we are only talking about directories here
2278  * (which are likely to remain cached), the case does not actually run all
2279  * that often and has the supreme advantage of not polluting the namecache
2280  * algorithms.
2281  *
2282  * If a fakename is supplied just construct a namecache entry using the
2283  * fake name.
2284  */
2285 static int
2286 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2287                        struct vnode *dvp, char *fakename)
2288 {
2289         struct nlcomponent nlc;
2290         struct nchandle rncp;
2291         struct dirent *den;
2292         struct vnode *pvp;
2293         struct vattr vat;
2294         struct iovec iov;
2295         struct uio uio;
2296         int blksize;
2297         int eofflag;
2298         int bytes;
2299         char *rbuf;
2300         int error;
2301
2302         vat.va_blocksize = 0;
2303         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2304                 return (error);
2305         cache_lock(nch);
2306         error = cache_vref(nch, cred, &pvp);
2307         cache_unlock(nch);
2308         if (error)
2309                 return (error);
2310         if (ncvp_debug) {
2311                 kprintf("inefficient_scan: directory iosize %ld "
2312                         "vattr fileid = %lld\n",
2313                         vat.va_blocksize,
2314                         (long long)vat.va_fileid);
2315         }
2316
2317         /*
2318          * Use the supplied fakename if not NULL.  Fake names are typically
2319          * not in the actual filesystem hierarchy.  This is used by HAMMER
2320          * to glue @@timestamp recursions together.
2321          */
2322         if (fakename) {
2323                 nlc.nlc_nameptr = fakename;
2324                 nlc.nlc_namelen = strlen(fakename);
2325                 rncp = cache_nlookup(nch, &nlc);
2326                 goto done;
2327         }
2328
2329         if ((blksize = vat.va_blocksize) == 0)
2330                 blksize = DEV_BSIZE;
2331         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2332         rncp.ncp = NULL;
2333
2334         eofflag = 0;
2335         uio.uio_offset = 0;
2336 again:
2337         iov.iov_base = rbuf;
2338         iov.iov_len = blksize;
2339         uio.uio_iov = &iov;
2340         uio.uio_iovcnt = 1;
2341         uio.uio_resid = blksize;
2342         uio.uio_segflg = UIO_SYSSPACE;
2343         uio.uio_rw = UIO_READ;
2344         uio.uio_td = curthread;
2345
2346         if (ncvp_debug >= 2)
2347                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2348         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2349         if (error == 0) {
2350                 den = (struct dirent *)rbuf;
2351                 bytes = blksize - uio.uio_resid;
2352
2353                 while (bytes > 0) {
2354                         if (ncvp_debug >= 2) {
2355                                 kprintf("cache_inefficient_scan: %*.*s\n",
2356                                         den->d_namlen, den->d_namlen,
2357                                         den->d_name);
2358                         }
2359                         if (den->d_type != DT_WHT &&
2360                             den->d_ino == vat.va_fileid) {
2361                                 if (ncvp_debug) {
2362                                         kprintf("cache_inefficient_scan: "
2363                                                "MATCHED inode %lld path %s/%*.*s\n",
2364                                                (long long)vat.va_fileid,
2365                                                nch->ncp->nc_name,
2366                                                den->d_namlen, den->d_namlen,
2367                                                den->d_name);
2368                                 }
2369                                 nlc.nlc_nameptr = den->d_name;
2370                                 nlc.nlc_namelen = den->d_namlen;
2371                                 rncp = cache_nlookup(nch, &nlc);
2372                                 KKASSERT(rncp.ncp != NULL);
2373                                 break;
2374                         }
2375                         bytes -= _DIRENT_DIRSIZ(den);
2376                         den = _DIRENT_NEXT(den);
2377                 }
2378                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2379                         goto again;
2380         }
2381         kfree(rbuf, M_TEMP);
2382 done:
2383         vrele(pvp);
2384         if (rncp.ncp) {
2385                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2386                         _cache_setvp(rncp.mount, rncp.ncp, dvp);
2387                         if (ncvp_debug >= 2) {
2388                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2389                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2390                         }
2391                 } else {
2392                         if (ncvp_debug >= 2) {
2393                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2394                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2395                                         rncp.ncp->nc_vp);
2396                         }
2397                 }
2398                 if (rncp.ncp->nc_vp == NULL)
2399                         error = rncp.ncp->nc_error;
2400                 /*
2401                  * Release rncp after a successful nlookup.  rncp was fully
2402                  * referenced.
2403                  */
2404                 cache_put(&rncp);
2405         } else {
2406                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2407                         dvp, nch->ncp->nc_name);
2408                 error = ENOENT;
2409         }
2410         return (error);
2411 }
2412
2413 /*
2414  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2415  * state, which disassociates it from its vnode or ncneglist.
2416  *
2417  * Then, if there are no additional references to the ncp and no children,
2418  * the ncp is removed from the topology and destroyed.
2419  *
2420  * References and/or children may exist if the ncp is in the middle of the
2421  * topology, preventing the ncp from being destroyed.
2422  *
2423  * This function must be called with the ncp held and locked and will unlock
2424  * and drop it during zapping.
2425  *
2426  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2427  * This case can occur in the cache_drop() path.
2428  *
2429  * This function may return a held (but NOT locked) parent node which the
2430  * caller must drop.  We do this so _cache_drop() can loop, to avoid
2431  * blowing out the kernel stack.
      *
      * Return value: NULL when the ncp could not be destroyed (our ref has
      * been dropped and the ncp unlocked), or when the destroyed ncp had no
      * parent.  Otherwise the former parent, held but unlocked.
2432  *
2433  * WARNING!  For MPSAFE operation this routine must acquire up to three
2434  *           spin locks to be able to safely test nc_refs.  Lock order is
2435  *           very important.
2436  *
2437  *           hash spinlock if on hash list
2438  *           parent spinlock if child of parent
2439  *           (the ncp is unresolved so there is no vnode association)
2440  */
2441 static struct namecache *
2442 cache_zap(struct namecache *ncp, int nonblock)
2443 {
2444         struct namecache *par;
2445         struct vnode *dropvp;
2446         int refs;
2447
2448         /*
2449          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2450          */
2451         _cache_setunresolved(ncp);
2452
2453         /*
2454          * Try to scrap the entry and possibly tail-recurse on its parent.
2455          * We only scrap unref'd (other than our ref) unresolved entries,
2456          * we do not scrap 'live' entries.
2457          *
2458          * Note that once the spinlocks are acquired if nc_refs == 1 no
2459          * other references are possible.  If it isn't, however, we have
2460          * to decrement but also be sure to avoid a 1->0 transition.
2461          */
2462         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2463         KKASSERT(ncp->nc_refs > 0);
2464
2465         /*
2466          * Acquire locks.  Note that the parent can't go away while we hold
2467          * a child locked.
2468          */
2469         if ((par = ncp->nc_parent) != NULL) {
2470                 if (nonblock) {
                             /*
                              * Each pass first retries the non-blocking
                              * parent lock.  While it cannot be had we flag
                              * the ncp NCF_DEFEREDZAP and try to give up our
                              * ref with a cmpset; once the cmpset succeeds
                              * the ncp is unlocked and we bail with NULL.
                              * A failed cmpset means nc_refs changed under
                              * us, so we loop.
                              */
2471                         for (;;) {
2472                                 if (_cache_lock_nonblock(par) == 0)
2473                                         break;
2474                                 refs = ncp->nc_refs;
2475                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2476                                 ++numdefered;   /* MP race ok */
2477                                 if (atomic_cmpset_int(&ncp->nc_refs,
2478                                                       refs, refs - 1)) {
2479                                         _cache_unlock(ncp);
2480                                         return(NULL);
2481                                 }
2482                                 cpu_pause();
2483                         }
2484                         _cache_hold(par);
2485                 } else {
2486                         _cache_hold(par);
2487                         _cache_lock(par);
2488                 }
2489                 spin_lock(&ncp->nc_head->spin);
2490         }
2491
2492         /*
2493          * If someone other than us has a ref or we have children
2494          * we cannot zap the entry.  The 1->0 transition and any
2495          * further list operation is protected by the spinlocks
2496          * we have acquired but other transitions are not.
          *
          * When the entry cannot be zapped we drop our ref with a cmpset,
          * release the hash spinlock and the parent (lock and hold), unlock
          * the ncp and return NULL.
2497          */
2498         for (;;) {
2499                 refs = ncp->nc_refs;
2500                 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
2501                         break;
2502                 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
2503                         if (par) {
2504                                 spin_unlock(&ncp->nc_head->spin);
2505                                 _cache_put(par);
2506                         }
2507                         _cache_unlock(ncp);
2508                         return(NULL);
2509                 }
2510                 cpu_pause();
2511         }
2512
2513         /*
2514          * We are the only ref and with the spinlocks held no further
2515          * refs can be acquired by others.
2516          *
2517          * Remove us from the hash list and parent list.  We have to
2518          * drop a ref on the parent's vp if the parent's list becomes
2519          * empty.
          *
          * The parent is unlocked here but its hold is kept; that hold is
          * what gets handed back to the caller.
2520          */
2521         dropvp = NULL;
2522         if (par) {
2523                 struct nchash_head *nchpp = ncp->nc_head;
2524
2525                 KKASSERT(nchpp != NULL);
2526                 LIST_REMOVE(ncp, nc_hash);
2527                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
2528                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
2529                         dropvp = par->nc_vp;
2530                 ncp->nc_head = NULL;
2531                 ncp->nc_parent = NULL;
2532                 spin_unlock(&nchpp->spin);
2533                 _cache_unlock(par);
2534         } else {
2535                 KKASSERT(ncp->nc_head == NULL);
2536         }
2537
2538         /*
2539          * ncp should not have picked up any refs.  Physically
2540          * destroy the ncp.
2541          */
2542         KKASSERT(ncp->nc_refs == 1);
2543         /* _cache_unlock(ncp) not required */
2544         ncp->nc_refs = -1;      /* safety */
2545         if (ncp->nc_name)
2546                 kfree(ncp->nc_name, M_VFSCACHE);
2547         kfree(ncp, M_VFSCACHE);
2548
2549         /*
2550          * Delayed drop (we had to release our spinlocks)
2551          *
2552          * The refed parent (if not NULL) must be dropped.  The
2553          * caller is responsible for looping.
2554          */
2555         if (dropvp)
2556                 vdrop(dropvp);
2557         return(par);
2558 }
2559
2560 /*
2561  * Clean up dangling negative cache and defered-drop entries in the
2562  * namecache.
2563  *
2564  * This routine is called in the critical path and also called from
2565  * vnlru().  When called from vnlru we use a lower limit to try to
2566  * deal with the negative cache before the critical path has to start
2567  * dealing with it.
2568  */
2569 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2570
2571 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2572 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2573
2574 void
2575 cache_hysteresis(int critpath)
2576 {
2577         int poslimit;
2578         int neglimit = desiredvnodes / ncnegfactor;
2579         int xnumcache = numcache;
2580
2581         if (critpath == 0)
2582                 neglimit = neglimit * 8 / 10;
2583
2584         /*
2585          * Don't cache too many negative hits.  We use hysteresis to reduce
2586          * the impact on the critical path.
2587          */
2588         switch(neg_cache_hysteresis_state[critpath]) {
2589         case CHI_LOW:
2590                 if (numneg > MINNEG && numneg > neglimit) {
2591                         if (critpath)
2592                                 _cache_cleanneg(ncnegflush);
2593                         else
2594                                 _cache_cleanneg(ncnegflush +
2595                                                 numneg - neglimit);
2596                         neg_cache_hysteresis_state[critpath] = CHI_HIGH;
2597                 }
2598                 break;
2599         case CHI_HIGH:
2600                 if (numneg > MINNEG * 9 / 10 &&
2601                     numneg * 9 / 10 > neglimit
2602                 ) {
2603                         if (critpath)
2604                                 _cache_cleanneg(ncnegflush);
2605                         else
2606                                 _cache_cleanneg(ncnegflush +
2607                                                 numneg * 9 / 10 - neglimit);
2608                 } else {
2609                         neg_cache_hysteresis_state[critpath] = CHI_LOW;
2610                 }
2611                 break;
2612         }
2613
2614         /*
2615          * Don't cache too many positive hits.  We use hysteresis to reduce
2616          * the impact on the critical path.
2617          *
2618          * Excessive positive hits can accumulate due to large numbers of
2619          * hardlinks (the vnode cache will not prevent hl ncps from growing
2620          * into infinity).
2621          */
2622         if ((poslimit = ncposlimit) == 0)
2623                 poslimit = desiredvnodes * 2;
2624         if (critpath == 0)
2625                 poslimit = poslimit * 8 / 10;
2626
2627         switch(pos_cache_hysteresis_state[critpath]) {
2628         case CHI_LOW:
2629                 if (xnumcache > poslimit && xnumcache > MINPOS) {
2630                         if (critpath)
2631                                 _cache_cleanpos(ncposflush);
2632                         else
2633                                 _cache_cleanpos(ncposflush +
2634                                                 xnumcache - poslimit);
2635                         pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2636                 }
2637                 break;
2638         case CHI_HIGH:
2639                 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2640                         if (critpath)
2641                                 _cache_cleanpos(ncposflush);
2642                         else
2643                                 _cache_cleanpos(ncposflush +
2644                                                 xnumcache - poslimit * 5 / 6);
2645                 } else {
2646                         pos_cache_hysteresis_state[critpath] = CHI_LOW;
2647                 }
2648                 break;
2649         }
2650
2651         /*
2652          * Clean out dangling defered-zap ncps which could not
2653          * be cleanly dropped if too many build up.  Note
2654          * that numdefered is not an exact number as such ncps
2655          * can be reused and the counter is not handled in a MP
2656          * safe manner by design.
2657          */
2658         if (numdefered > neglimit) {
2659                 _cache_cleandefered();
2660         }
2661 }
2662
2663 /*
2664  * NEW NAMECACHE LOOKUP API
2665  *
2666  * Lookup an entry in the namecache.  The passed par_nch must be referenced
2667  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
2668  * is ALWAYS returned, eve if the supplied component is illegal.
2669  *
2670  * The resulting namecache entry should be returned to the system with
2671  * cache_put() or cache_unlock() + cache_drop().
2672  *
2673  * namecache locks are recursive but care must be taken to avoid lock order
2674  * reversals (hence why the passed par_nch must be unlocked).  Locking
2675  * rules are to order for parent traversals, not for child traversals.
2676  *
2677  * Nobody else will be able to manipulate the associated namespace (e.g.
2678  * create, delete, rename, rename-target) until the caller unlocks the
2679  * entry.
2680  *
2681  * The returned entry will be in one of three states:  positive hit (non-null
2682  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2683  * Unresolved entries must be resolved through the filesystem to associate the
2684  * vnode and/or determine whether a positive or negative hit has occured.
2685  *
2686  * It is not necessary to lock a directory in order to lock namespace under
2687  * that directory.  In fact, it is explicitly not allowed to do that.  A
2688  * directory is typically only locked when being created, renamed, or
2689  * destroyed.
2690  *
2691  * The directory (par) may be unresolved, in which case any returned child
2692  * will likely also be marked unresolved.  Likely but not guarenteed.  Since
2693  * the filesystem lookup requires a resolved directory vnode the caller is
2694  * responsible for resolving the namecache chain top-down.  This API
2695  * specifically allows whole chains to be created in an unresolved state.
2696  */
2697 struct nchandle
2698 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
2699 {
2700         struct nchandle nch;
2701         struct namecache *ncp;
2702         struct namecache *new_ncp;
2703         struct nchash_head *nchpp;
2704         struct mount *mp;
2705         u_int32_t hash;
2706         globaldata_t gd;
2707         int par_locked;
2708
2709         numcalls++;
2710         gd = mycpu;
2711         mp = par_nch->mount;
2712         par_locked = 0;
2713
2714         /*
2715          * This is a good time to call it, no ncp's are locked by
2716          * the caller or us.
2717          */
2718         cache_hysteresis(1);
2719
2720         /*
2721          * Try to locate an existing entry
2722          */
2723         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2724         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2725         new_ncp = NULL;
2726         nchpp = NCHHASH(hash);
2727 restart:
2728         if (new_ncp)
2729                 spin_lock(&nchpp->spin);
2730         else
2731                 spin_lock_shared(&nchpp->spin);
2732
2733         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2734                 numchecks++;
2735
2736                 /*
2737                  * Break out if we find a matching entry.  Note that
2738                  * UNRESOLVED entries may match, but DESTROYED entries
2739                  * do not.
2740                  */
2741                 if (ncp->nc_parent == par_nch->ncp &&
2742                     ncp->nc_nlen == nlc->nlc_namelen &&
2743                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2744                     (ncp->nc_flag & NCF_DESTROYED) == 0
2745                 ) {
2746                         _cache_hold(ncp);
2747                         if (new_ncp)
2748                                 spin_unlock(&nchpp->spin);
2749                         else
2750                                 spin_unlock_shared(&nchpp->spin);
2751                         if (par_locked) {
2752                                 _cache_unlock(par_nch->ncp);
2753                                 par_locked = 0;
2754                         }
2755                         if (_cache_lock_special(ncp) == 0) {
2756                                 /*
2757                                  * Successfully locked but we must re-test
2758                                  * conditions that might have changed since
2759                                  * we did not have the lock before.
2760                                  */
2761                                 if ((ncp->nc_flag & NCF_DESTROYED) ||
2762                                     ncp->nc_parent != par_nch->ncp) {
2763                                         _cache_put(ncp);
2764                                         goto restart;
2765                                 }
2766                                 _cache_auto_unresolve(mp, ncp);
2767                                 if (new_ncp)
2768                                         _cache_free(new_ncp);
2769                                 goto found;
2770                         }
2771                         _cache_get(ncp);        /* cycle the lock to block */
2772                         _cache_put(ncp);
2773                         _cache_drop(ncp);
2774                         goto restart;
2775                 }
2776         }
2777
2778         /*
2779          * We failed to locate an entry, create a new entry and add it to
2780          * the cache.  The parent ncp must also be locked so we
2781          * can link into it.
2782          *
2783          * We have to relookup after possibly blocking in kmalloc or
2784          * when locking par_nch.
2785          *
2786          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2787          *       mount case, in which case nc_name will be NULL.
2788          */
2789         if (new_ncp == NULL) {
2790                 spin_unlock_shared(&nchpp->spin);
2791                 new_ncp = cache_alloc(nlc->nlc_namelen);
2792                 if (nlc->nlc_namelen) {
2793                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2794                               nlc->nlc_namelen);
2795                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
2796                 }
2797                 goto restart;
2798         }
2799
2800         /*
2801          * NOTE! The spinlock is held exclusively here because new_ncp
2802          *       is non-NULL.
2803          */
2804         if (par_locked == 0) {
2805                 spin_unlock(&nchpp->spin);
2806                 _cache_lock(par_nch->ncp);
2807                 par_locked = 1;
2808                 goto restart;
2809         }
2810
2811         /*
2812          * WARNING!  We still hold the spinlock.  We have to set the hash
2813          *           table entry atomically.
2814          */
2815         ncp = new_ncp;
2816         _cache_link_parent(ncp, par_nch->ncp, nchpp);
2817         spin_unlock(&nchpp->spin);
2818         _cache_unlock(par_nch->ncp);
2819         /* par_locked = 0 - not used */
2820 found:
2821         /*
2822          * stats and namecache size management
2823          */
2824         if (ncp->nc_flag & NCF_UNRESOLVED)
2825                 ++gd->gd_nchstats->ncs_miss;
2826         else if (ncp->nc_vp)
2827                 ++gd->gd_nchstats->ncs_goodhits;
2828         else
2829                 ++gd->gd_nchstats->ncs_neghits;
2830         nch.mount = mp;
2831         nch.ncp = ncp;
2832         atomic_add_int(&nch.mount->mnt_refs, 1);
2833         return(nch);
2834 }
2835
2836 /*
2837  * Attempt to lookup a namecache entry and return with a shared namecache
2838  * lock.
2839  */
2840 int
2841 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc,
2842                            int excl, struct nchandle *res_nch)
2843 {
2844         struct namecache *ncp;
2845         struct nchash_head *nchpp;
2846         struct mount *mp;
2847         u_int32_t hash;
2848         globaldata_t gd;
2849
2850         /*
2851          * If exclusive requested or shared namecache locks are disabled,
2852          * return failure.
2853          */
2854         if (ncp_shared_lock_disable || excl)
2855                 return(EWOULDBLOCK);
2856
2857         numcalls++;
2858         gd = mycpu;
2859         mp = par_nch->mount;
2860
2861         /*
2862          * This is a good time to call it, no ncp's are locked by
2863          * the caller or us.
2864          */
2865         cache_hysteresis(1);
2866
2867         /*
2868          * Try to locate an existing entry
2869          */
2870         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2871         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2872         nchpp = NCHHASH(hash);
2873
2874         spin_lock_shared(&nchpp->spin);
2875
2876         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2877                 numchecks++;
2878
2879                 /*
2880                  * Break out if we find a matching entry.  Note that
2881                  * UNRESOLVED entries may match, but DESTROYED entries
2882                  * do not.
2883                  */
2884                 if (ncp->nc_parent == par_nch->ncp &&
2885                     ncp->nc_nlen == nlc->nlc_namelen &&
2886                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2887                     (ncp->nc_flag & NCF_DESTROYED) == 0
2888                 ) {
2889                         _cache_hold(ncp);
2890                         spin_unlock_shared(&nchpp->spin);
2891                         if (_cache_lock_shared_special(ncp) == 0) {
2892                                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
2893                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
2894                                     _cache_auto_unresolve_test(mp, ncp) == 0) {
2895                                         goto found;
2896                                 }
2897                                 _cache_unlock(ncp);
2898                         }
2899                         _cache_drop(ncp);
2900                         spin_lock_shared(&nchpp->spin);
2901                         break;
2902                 }
2903         }
2904
2905         /*
2906          * Failure
2907          */
2908         spin_unlock_shared(&nchpp->spin);
2909         return(EWOULDBLOCK);
2910
2911         /*
2912          * Success
2913          *
2914          * Note that nc_error might be non-zero (e.g ENOENT).
2915          */
2916 found:
2917         res_nch->mount = mp;
2918         res_nch->ncp = ncp;
2919         ++gd->gd_nchstats->ncs_goodhits;
2920         atomic_add_int(&res_nch->mount->mnt_refs, 1);
2921
2922         KKASSERT(ncp->nc_error != EWOULDBLOCK);
2923         return(ncp->nc_error);
2924 }
2925
2926 /*
2927  * This is a non-blocking verison of cache_nlookup() used by
2928  * nfs_readdirplusrpc_uio().  It can fail for any reason and
2929  * will return nch.ncp == NULL in that case.
2930  */
2931 struct nchandle
2932 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
2933 {
2934         struct nchandle nch;
2935         struct namecache *ncp;
2936         struct namecache *new_ncp;
2937         struct nchash_head *nchpp;
2938         struct mount *mp;
2939         u_int32_t hash;
2940         globaldata_t gd;
2941         int par_locked;
2942
2943         numcalls++;
2944         gd = mycpu;
2945         mp = par_nch->mount;
2946         par_locked = 0;
2947
2948         /*
2949          * Try to locate an existing entry
2950          */
2951         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2952         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2953         new_ncp = NULL;
2954         nchpp = NCHHASH(hash);
2955 restart:
2956         spin_lock(&nchpp->spin);
2957         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2958                 numchecks++;
2959
2960                 /*
2961                  * Break out if we find a matching entry.  Note that
2962                  * UNRESOLVED entries may match, but DESTROYED entries
2963                  * do not.
2964                  */
2965                 if (ncp->nc_parent == par_nch->ncp &&
2966                     ncp->nc_nlen == nlc->nlc_namelen &&
2967                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2968                     (ncp->nc_flag & NCF_DESTROYED) == 0
2969                 ) {
2970                         _cache_hold(ncp);
2971                         spin_unlock(&nchpp->spin);
2972                         if (par_locked) {
2973                                 _cache_unlock(par_nch->ncp);
2974                                 par_locked = 0;
2975                         }
2976                         if (_cache_lock_special(ncp) == 0) {
2977                                 _cache_auto_unresolve(mp, ncp);
2978                                 if (new_ncp) {
2979                                         _cache_free(new_ncp);
2980                                         new_ncp = NULL;
2981                                 }
2982                                 goto found;
2983                         }
2984                         _cache_drop(ncp);
2985                         goto failed;
2986                 }
2987         }
2988
2989         /*
2990          * We failed to locate an entry, create a new entry and add it to
2991          * the cache.  The parent ncp must also be locked so we
2992          * can link into it.
2993          *
2994          * We have to relookup after possibly blocking in kmalloc or
2995          * when locking par_nch.
2996          *
2997          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2998          *       mount case, in which case nc_name will be NULL.
2999          */
3000         if (new_ncp == NULL) {
3001                 spin_unlock(&nchpp->spin);
3002                 new_ncp = cache_alloc(nlc->nlc_namelen);
3003                 if (nlc->nlc_namelen) {
3004                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3005                               nlc->nlc_namelen);
3006                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3007                 }
3008                 goto restart;
3009         }
3010         if (par_locked == 0) {
3011                 spin_unlock(&nchpp->spin);
3012                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3013                         par_locked = 1;
3014                         goto restart;
3015                 }
3016                 goto failed;
3017         }
3018
3019         /*
3020          * WARNING!  We still hold the spinlock.  We have to set the hash
3021          *           table entry atomically.
3022          */
3023         ncp = new_ncp;
3024         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3025         spin_unlock(&nchpp->spin);
3026         _cache_unlock(par_nch->ncp);
3027         /* par_locked = 0 - not used */
3028 found:
3029         /*
3030          * stats and namecache size management
3031          */
3032         if (ncp->nc_flag & NCF_UNRESOLVED)
3033                 ++gd->gd_nchstats->ncs_miss;
3034         else if (ncp->nc_vp)
3035                 ++gd->gd_nchstats->ncs_goodhits;
3036         else
3037                 ++gd->gd_nchstats->ncs_neghits;
3038         nch.mount = mp;
3039         nch.ncp = ncp;
3040         atomic_add_int(&nch.mount->mnt_refs, 1);
3041         return(nch);
3042 failed:
3043         if (new_ncp) {
3044                 _cache_free(new_ncp);
3045                 new_ncp = NULL;
3046         }
3047         nch.mount = NULL;
3048         nch.ncp = NULL;
3049         return(nch);
3050 }
3051
3052 /*
3053  * The namecache entry is marked as being used as a mount point.
3054  * Locate the mount if it is visible to the caller.  The DragonFly
3055  * mount system allows arbitrary loops in the topology and disentangles
3056  * those loops by matching against (mp, ncp) rather than just (ncp).
3057  * This means any given ncp can dive any number of mounts, depending
3058  * on the relative mount (e.g. nullfs) the caller is at in the topology.
3059  *
3060  * We use a very simple frontend cache to reduce SMP conflicts,
3061  * which we have to do because the mountlist scan needs an exclusive
3062  * lock around its ripout info list.  Not to mention that there might
3063  * be a lot of mounts.
3064  */
/*
 * Search state passed to cache_findmount_callback() through the opaque
 * data argument of mountlist_scan().
 */
3065 struct findmount_info {
3066         struct mount *result;           /* out: matching mount (mnt_refs bumped by the callback), NULL if none */
3067         struct mount *nch_mount;        /* in: mount of the nchandle being searched for */
3068         struct namecache *nch_ncp;      /* in: ncp of the nchandle being searched for */
3069 };
3070
3071 static
3072 struct ncmount_cache *
3073 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3074 {
3075         int hash;
3076
3077         hash = ((int)(intptr_t)mp / sizeof(*mp)) ^
3078                ((int)(intptr_t)ncp / sizeof(*ncp));
3079         hash = (hash & 0x7FFFFFFF) % NCMOUNT_NUMCACHE;
3080         return (&ncmount_cache[hash]);
3081 }
3082
3083 static
3084 int
3085 cache_findmount_callback(struct mount *mp, void *data)
3086 {
3087         struct findmount_info *info = data;
3088
3089         /*
3090          * Check the mount's mounted-on point against the passed nch.
3091          */
3092         if (mp->mnt_ncmounton.mount == info->nch_mount &&
3093             mp->mnt_ncmounton.ncp == info->nch_ncp
3094         ) {
3095             info->result = mp;
3096             atomic_add_int(&mp->mnt_refs, 1);
3097             return(-1);
3098         }
3099         return(0);
3100 }
3101
3102 struct mount *
3103 cache_findmount(struct nchandle *nch)
3104 {
3105         struct findmount_info info;
3106         struct ncmount_cache *ncc;
3107         struct mount *mp;
3108
3109         /*
3110          * Fast
3111          */
3112         if (ncmount_cache_enable == 0) {
3113                 ncc = NULL;
3114                 goto skip;
3115         }
3116         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3117         if (ncc->ncp == nch->ncp) {
3118                 spin_lock_shared(&ncc->spin);
3119                 if (ncc->isneg == 0 &&
3120                     ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) {
3121                         if (mp->mnt_ncmounton.mount == nch->mount &&
3122                             mp->mnt_ncmounton.ncp == nch->ncp) {
3123                                 /*
3124                                  * Cache hit (positive)
3125                                  */
3126                                 atomic_add_int(&mp->mnt_refs, 1);
3127                                 spin_unlock_shared(&ncc->spin);
3128                                 ++ncmount_cache_hit;
3129                                 return(mp);
3130                         }
3131                         /* else cache miss */
3132                 }
3133                 if (ncc->isneg &&
3134                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3135                         /*
3136                          * Cache hit (negative)
3137                          */
3138                         spin_unlock_shared(&ncc->spin);
3139                         ++ncmount_cache_hit;
3140                         return(NULL);
3141                 }
3142                 spin_unlock_shared(&ncc->spin);
3143         }
3144 skip:
3145
3146         /*
3147          * Slow
3148          */
3149         info.result = NULL;
3150         info.nch_mount = nch->mount;
3151         info.nch_ncp = nch->ncp;
3152         mountlist_scan(cache_findmount_callback, &info,
3153                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
3154
3155         /*
3156          * Cache the result.
3157          *
3158          * Negative lookups: We cache the originating {ncp,mp}. (mp) is
3159          *                   only used for pointer comparisons and is not
3160          *                   referenced (otherwise there would be dangling
3161          *                   refs).
3162          *
3163          * Positive lookups: We cache the originating {ncp} and the target
3164          *                   (mp).  (mp) is referenced.
3165          *
3166          * Indeterminant:    If the match is undergoing an unmount we do
3167          *                   not cache it to avoid racing cache_unmounting(),
3168          *                   but still return the match.
3169          */
3170         if (ncc) {
3171                 spin_lock(&ncc->spin);
3172                 if (info.result == NULL) {
3173                         if (ncc->isneg == 0 && ncc->mp)
3174                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3175                         ncc->ncp = nch->ncp;
3176                         ncc->mp = nch->mount;
3177                         ncc->isneg = 1;
3178                         spin_unlock(&ncc->spin);
3179                         ++ncmount_cache_overwrite;
3180                 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) {
3181                         if (ncc->isneg == 0 && ncc->mp)
3182                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3183                         atomic_add_int(&info.result->mnt_refs, 1);
3184                         ncc->ncp = nch->ncp;
3185                         ncc->mp = info.result;
3186                         ncc->isneg = 0;
3187                         spin_unlock(&ncc->spin);
3188                         ++ncmount_cache_overwrite;
3189                 } else {
3190                         spin_unlock(&ncc->spin);
3191                 }
3192                 ++ncmount_cache_miss;
3193         }
3194         return(info.result);
3195 }
3196
3197 void
3198 cache_dropmount(struct mount *mp)
3199 {
3200         atomic_add_int(&mp->mnt_refs, -1);
3201 }
3202
3203 void
3204 cache_ismounting(struct mount *mp)
3205 {
3206         struct nchandle *nch = &mp->mnt_ncmounton;
3207         struct ncmount_cache *ncc;
3208
3209         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3210         if (ncc->isneg &&
3211             ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3212                 spin_lock(&ncc->spin);
3213                 if (ncc->isneg &&
3214                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3215                         ncc->ncp = NULL;
3216                         ncc->mp = NULL;
3217                 }
3218                 spin_unlock(&ncc->spin);
3219         }
3220 }
3221
3222 void
3223 cache_unmounting(struct mount *mp)
3224 {
3225         struct nchandle *nch = &mp->mnt_ncmounton;
3226         struct ncmount_cache *ncc;
3227
3228         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3229         if (ncc->isneg == 0 &&
3230             ncc->ncp == nch->ncp && ncc->mp == mp) {
3231                 spin_lock(&ncc->spin);
3232                 if (ncc->isneg == 0 &&
3233                     ncc->ncp == nch->ncp && ncc->mp == mp) {
3234                         atomic_add_int(&mp->mnt_refs, -1);
3235                         ncc->ncp = NULL;
3236                         ncc->mp = NULL;
3237                 }
3238                 spin_unlock(&ncc->spin);
3239         }
3240 }
3241
3242 /*
3243  * Resolve an unresolved namecache entry, generally by looking it up.
3244  * The passed ncp must be locked and refd.
3245  *
3246  * Theoretically since a vnode cannot be recycled while held, and since
3247  * the nc_parent chain holds its vnode as long as children exist, the
3248  * direct parent of the cache entry we are trying to resolve should
3249  * have a valid vnode.  If not then generate an error that we can
3250  * determine is related to a resolver bug.
3251  *
3252  * However, if a vnode was in the middle of a recyclement when the NCP
3253  * got locked, ncp->nc_vp might point to a vnode that is about to become
3254  * invalid.  cache_resolve() handles this case by unresolving the entry
3255  * and then re-resolving it.
3256  *
3257  * Note that successful resolution does not necessarily return an error
3258  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
3259  * will be returned.
3260  */
3261 int
3262 cache_resolve(struct nchandle *nch, struct ucred *cred)
3263 {
3264         struct namecache *par_tmp;
3265         struct namecache *par;
3266         struct namecache *ncp;
3267         struct nchandle nctmp;
3268         struct mount *mp;
3269         struct vnode *dvp;
3270         int error;
3271
3272         ncp = nch->ncp;
3273         mp = nch->mount;
3274         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
3275 restart:
3276         /*
3277          * If the ncp is already resolved we have nothing to do.  However,
3278          * we do want to guarentee that a usable vnode is returned when
3279          * a vnode is present, so make sure it hasn't been reclaimed.
3280          */
3281         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3282                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3283                         _cache_setunresolved(ncp);
3284                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
3285                         return (ncp->nc_error);
3286         }
3287
3288         /*
3289          * If the ncp was destroyed it will never resolve again.  This
3290          * can basically only happen when someone is chdir'd into an
3291          * empty directory which is then rmdir'd.  We want to catch this
3292          * here and not dive the VFS because the VFS might actually
3293          * have a way to re-resolve the disconnected ncp, which will
3294          * result in inconsistencies in the cdir/nch for proc->p_fd.
3295          */
3296         if (ncp->nc_flag & NCF_DESTROYED) {
3297                 kprintf("Warning: cache_resolve: ncp '%s' was unlinked\n",
3298                         ncp->nc_name);
3299                 return(EINVAL);
3300         }
3301
3302         /*
3303          * Mount points need special handling because the parent does not
3304          * belong to the same filesystem as the ncp.
3305          */
3306         if (ncp == mp->mnt_ncmountpt.ncp)
3307                 return (cache_resolve_mp(mp));
3308
3309         /*
3310          * We expect an unbroken chain of ncps to at least the mount point,
3311          * and even all the way to root (but this code doesn't have to go
3312          * past the mount point).
3313          */
3314         if (ncp->nc_parent == NULL) {
3315                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3316                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3317                 ncp->nc_error = EXDEV;
3318                 return(ncp->nc_error);
3319         }
3320
3321         /*
3322          * The vp's of the parent directories in the chain are held via vhold()
3323          * due to the existance of the child, and should not disappear.
3324          * However, there are cases where they can disappear:
3325          *
3326          *      - due to filesystem I/O errors.
3327          *      - due to NFS being stupid about tracking the namespace and
3328          *        destroys the namespace for entire directories quite often.
3329          *      - due to forced unmounts.
3330          *      - due to an rmdir (parent will be marked DESTROYED)
3331          *
3332          * When this occurs we have to track the chain backwards and resolve
3333          * it, looping until the resolver catches up to the current node.  We
3334          * could recurse here but we might run ourselves out of kernel stack
3335          * so we do it in a more painful manner.  This situation really should
3336          * not occur all that often, or if it does not have to go back too
3337          * many nodes to resolve the ncp.
3338          */
3339         while ((dvp = cache_dvpref(ncp)) == NULL) {
3340                 /*
3341                  * This case can occur if a process is CD'd into a
3342                  * directory which is then rmdir'd.  If the parent is marked
3343                  * destroyed there is no point trying to resolve it.
3344                  */
3345                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3346                         return(ENOENT);
3347                 par = ncp->nc_parent;
3348                 _cache_hold(par);
3349                 _cache_lock(par);
3350                 while ((par_tmp = par->nc_parent) != NULL &&
3351                        par_tmp->nc_vp == NULL) {
3352                         _cache_hold(par_tmp);
3353                         _cache_lock(par_tmp);
3354                         _cache_put(par);
3355                         par = par_tmp;
3356                 }
3357                 if (par->nc_parent == NULL) {
3358                         kprintf("EXDEV case 2 %*.*s\n",
3359                                 par->nc_nlen, par->nc_nlen, par->nc_name);
3360                         _cache_put(par);
3361                         return (EXDEV);
3362                 }
3363                 /*
3364                  * The parent is not set in stone, ref and lock it to prevent
3365                  * it from disappearing.  Also note that due to renames it
3366                  * is possible for our ncp to move and for par to no longer
3367                  * be one of its parents.  We resolve it anyway, the loop
3368                  * will handle any moves.
3369                  */
3370                 _cache_get(par);        /* additional hold/lock */
3371                 _cache_put(par);        /* from earlier hold/lock */
3372                 if (par == nch->mount->mnt_ncmountpt.ncp) {
3373                         cache_resolve_mp(nch->mount);
3374                 } else if ((dvp = cache_dvpref(par)) == NULL) {
3375                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
3376                         _cache_put(par);
3377                         continue;
3378                 } else {
3379                         if (par->nc_flag & NCF_UNRESOLVED) {
3380                                 nctmp.mount = mp;
3381                                 nctmp.ncp = par;
3382                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3383                         }
3384                         vrele(dvp);
3385                 }
3386                 if ((error = par->nc_error) != 0) {
3387                         if (par->nc_error != EAGAIN) {
3388                                 kprintf("EXDEV case 3 %*.*s error %d\n",
3389                                     par->nc_nlen, par->nc_nlen, par->nc_name,
3390                                     par->nc_error);
3391                                 _cache_put(par);
3392                                 return(error);
3393                         }
3394                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3395                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
3396                 }
3397                 _cache_put(par);
3398                 /* loop */
3399         }
3400
3401         /*
3402          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3403          * ncp's and reattach them.  If this occurs the original ncp is marked
3404          * EAGAIN to force a relookup.
3405          *
3406          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3407          * ncp must already be resolved.
3408          */
3409         if (dvp) {
3410                 nctmp.mount = mp;
3411                 nctmp.ncp = ncp;
3412                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3413                 vrele(dvp);
3414         } else {
3415                 ncp->nc_error = EPERM;
3416         }
3417         if (ncp->nc_error == EAGAIN) {
3418                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3419                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3420                 goto restart;
3421         }
3422         return(ncp->nc_error);
3423 }
3424
3425 /*
3426  * Resolve the ncp associated with a mount point.  Such ncp's almost always
3427  * remain resolved and this routine is rarely called.  NFS MPs tends to force
3428  * re-resolution more often due to its mac-truck-smash-the-namecache
3429  * method of tracking namespace changes.
3430  *
3431  * The semantics for this call is that the passed ncp must be locked on
3432  * entry and will be locked on return.  However, if we actually have to
3433  * resolve the mount point we temporarily unlock the entry in order to
3434  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
3435  * the unlock we have to recheck the flags after we relock.
3436  */
3437 static int
3438 cache_resolve_mp(struct mount *mp)
3439 {
3440         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
3441         struct vnode *vp;
3442         int error;
3443
3444         KKASSERT(mp != NULL);
3445
3446         /*
3447          * If the ncp is already resolved we have nothing to do.  However,
3448          * we do want to guarentee that a usable vnode is returned when
3449          * a vnode is present, so make sure it hasn't been reclaimed.
3450          */
3451         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3452                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3453                         _cache_setunresolved(ncp);
3454         }
3455
3456         if (ncp->nc_flag & NCF_UNRESOLVED) {
3457                 _cache_unlock(ncp);
3458                 while (vfs_busy(mp, 0))
3459                         ;
3460                 error = VFS_ROOT(mp, &vp);
3461                 _cache_lock(ncp);
3462
3463                 /*
3464                  * recheck the ncp state after relocking.
3465                  */
3466                 if (ncp->nc_flag & NCF_UNRESOLVED) {
3467                         ncp->nc_error = error;
3468                         if (error == 0) {
3469                                 _cache_setvp(mp, ncp, vp);
3470                                 vput(vp);
3471                         } else {
3472                                 kprintf("[diagnostic] cache_resolve_mp: failed"
3473                                         " to resolve mount %p err=%d ncp=%p\n",
3474                                         mp, error, ncp);
3475                                 _cache_setvp(mp, ncp, NULL);
3476                         }
3477                 } else if (error == 0) {
3478                         vput(vp);
3479                 }
3480                 vfs_unbusy(mp);
3481         }
3482         return(ncp->nc_error);
3483 }
3484
3485 /*
3486  * Clean out negative cache entries when too many have accumulated.
3487  */
3488 static void
3489 _cache_cleanneg(int count)
3490 {
3491         struct namecache *ncp;
3492
3493         /*
3494          * Attempt to clean out the specified number of negative cache
3495          * entries.
3496          */
3497         while (count) {
3498                 spin_lock(&ncspin);
3499                 ncp = TAILQ_FIRST(&ncneglist);
3500                 if (ncp == NULL) {
3501                         spin_unlock(&ncspin);
3502                         break;
3503                 }
3504                 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
3505                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
3506                 _cache_hold(ncp);
3507                 spin_unlock(&ncspin);
3508
3509                 /*
3510                  * This can race, so we must re-check that the ncp
3511                  * is on the ncneglist after successfully locking it.
3512                  */
3513                 if (_cache_lock_special(ncp) == 0) {
3514                         if (ncp->nc_vp == NULL &&
3515                             (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3516                                 ncp = cache_zap(ncp, 1);
3517                                 if (ncp)
3518                                         _cache_drop(ncp);
3519                         } else {
3520                                 kprintf("cache_cleanneg: race avoided\n");
3521                                 _cache_unlock(ncp);
3522                         }
3523                 } else {
3524                         _cache_drop(ncp);
3525                 }
3526                 --count;
3527         }
3528 }
3529
3530 /*
3531  * Clean out positive cache entries when too many have accumulated.
3532  */
3533 static void
3534 _cache_cleanpos(int count)
3535 {
3536         static volatile int rover;
3537         struct nchash_head *nchpp;
3538         struct namecache *ncp;
3539         int rover_copy;
3540
3541         /*
3542          * Attempt to clean out the specified number of negative cache
3543          * entries.
3544          */
3545         while (count) {
3546                 rover_copy = ++rover;   /* MPSAFEENOUGH */
3547                 cpu_ccfence();
3548                 nchpp = NCHHASH(rover_copy);
3549
3550                 spin_lock_shared(&nchpp->spin);
3551                 ncp = LIST_FIRST(&nchpp->list);
3552                 while (ncp && (ncp->nc_flag & NCF_DESTROYED))
3553                         ncp = LIST_NEXT(ncp, nc_hash);
3554                 if (ncp)
3555                         _cache_hold(ncp);
3556                 spin_unlock_shared(&nchpp->spin);
3557
3558                 if (ncp) {
3559                         if (_cache_lock_special(ncp) == 0) {
3560                                 ncp = cache_zap(ncp, 1);
3561                                 if (ncp)
3562                                         _cache_drop(ncp);
3563                         } else {
3564                                 _cache_drop(ncp);
3565                         }
3566                 }
3567                 --count;
3568         }
3569 }
3570
3571 /*
3572  * This is a kitchen sink function to clean out ncps which we
3573  * tried to zap from cache_drop() but failed because we were
3574  * unable to acquire the parent lock.
3575  *
3576  * Such entries can also be removed via cache_inval_vp(), such
3577  * as when unmounting.
3578  */
3579 static void
3580 _cache_cleandefered(void)
3581 {
3582         struct nchash_head *nchpp;
3583         struct namecache *ncp;
3584         struct namecache dummy;
3585         int i;
3586
3587         numdefered = 0;
3588         bzero(&dummy, sizeof(dummy));
3589         dummy.nc_flag = NCF_DESTROYED;
3590         dummy.nc_refs = 1;
3591
3592         for (i = 0; i <= nchash; ++i) {
3593                 nchpp = &nchashtbl[i];
3594
3595                 spin_lock(&nchpp->spin);
3596                 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
3597                 ncp = &dummy;
3598                 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
3599                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
3600                                 continue;
3601                         LIST_REMOVE(&dummy, nc_hash);
3602                         LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
3603                         _cache_hold(ncp);
3604                         spin_unlock(&nchpp->spin);
3605                         if (_cache_lock_nonblock(ncp) == 0) {
3606                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
3607                                 _cache_unlock(ncp);
3608                         }
3609                         _cache_drop(ncp);
3610                         spin_lock(&nchpp->spin);
3611                         ncp = &dummy;
3612                 }
3613                 LIST_REMOVE(&dummy, nc_hash);
3614                 spin_unlock(&nchpp->spin);
3615         }
3616 }
3617
3618 /*
3619  * Name cache initialization, from vfsinit() when we are booting
3620  */
3621 void
3622 nchinit(void)
3623 {
3624         int i;
3625         globaldata_t gd;
3626
3627         /* initialise per-cpu namecache effectiveness statistics. */
3628         for (i = 0; i < ncpus; ++i) {
3629                 gd = globaldata_find(i);
3630                 gd->gd_nchstats = &nchstats[i];
3631         }
3632         TAILQ_INIT(&ncneglist);
3633         spin_init(&ncspin, "nchinit");
3634         nchashtbl = hashinit_ext(desiredvnodes / 2,
3635                                  sizeof(struct nchash_head),
3636                                  M_VFSCACHE, &nchash);
3637         for (i = 0; i <= (int)nchash; ++i) {
3638                 LIST_INIT(&nchashtbl[i].list);
3639                 spin_init(&nchashtbl[i].spin, "nchinit_hash");
3640         }
3641         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
3642                 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
3643         nclockwarn = 5 * hz;
3644 }
3645
3646 /*
3647  * Called from start_init() to bootstrap the root filesystem.  Returns
3648  * a referenced, unlocked namecache record.
3649  */
3650 void
3651 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
3652 {
3653         nch->ncp = cache_alloc(0);
3654         nch->mount = mp;
3655         atomic_add_int(&mp->mnt_refs, 1);
3656         if (vp)
3657                 _cache_setvp(nch->mount, nch->ncp, vp);
3658 }
3659
3660 /*
3661  * vfs_cache_setroot()
3662  *
3663  *      Create an association between the root of our namecache and
3664  *      the root vnode.  This routine may be called several times during
3665  *      booting.
3666  *
3667  *      If the caller intends to save the returned namecache pointer somewhere
3668  *      it must cache_hold() it.
3669  */
3670 void
3671 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
3672 {
3673         struct vnode *ovp;
3674         struct nchandle onch;
3675
3676         ovp = rootvnode;
3677         onch = rootnch;
3678         rootvnode = nvp;
3679         if (nch)
3680                 rootnch = *nch;
3681         else
3682                 cache_zero(&rootnch);
3683         if (ovp)
3684                 vrele(ovp);
3685         if (onch.ncp)
3686                 cache_drop(&onch);
3687 }
3688
3689 /*
3690  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
3691  * topology and is being removed as quickly as possible.  The new VOP_N*()
3692  * API calls are required to make specific adjustments using the supplied
3693  * ncp pointers rather then just bogusly purging random vnodes.
3694  *
3695  * Invalidate all namecache entries to a particular vnode as well as
3696  * any direct children of that vnode in the namecache.  This is a
3697  * 'catch all' purge used by filesystems that do not know any better.
3698  *
3699  * Note that the linkage between the vnode and its namecache entries will
3700  * be removed, but the namecache entries themselves might stay put due to
3701  * active references from elsewhere in the system or due to the existance of
3702  * the children.   The namecache topology is left intact even if we do not
3703  * know what the vnode association is.  Such entries will be marked
3704  * NCF_UNRESOLVED.
3705  */
3706 void
3707 cache_purge(struct vnode *vp)
3708 {
3709         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
3710 }
3711
3712 /*
3713  * Flush all entries referencing a particular filesystem.
3714  *
3715  * Since we need to check it anyway, we will flush all the invalid
3716  * entries at the same time.
3717  */
3718 #if 0
3719
/*
 * NOTE: this function sits inside an #if 0 region and is not compiled.
 * It also contains stray "XXX" tokens after the spin lock calls and would
 * not build as written.
 *
 * Intended behavior as written: walk every hash chain from the last
 * bucket down to the first and zap each entry whose nc_mount is (mp).
 */
void
cache_purgevfs(struct mount *mp)
{
        struct nchash_head *nchpp;
        struct namecache *ncp, *nnp;

        /*
         * Scan hash tables for applicable entries.
         */
        for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
                spin_lock_wr(&nchpp->spin); XXX
                ncp = LIST_FIRST(&nchpp->list);
                if (ncp)
                        _cache_hold(ncp);
                while (ncp) {
                        /*
                         * Hold the successor before the current entry
                         * can be zapped.
                         */
                        nnp = LIST_NEXT(ncp, nc_hash);
                        if (nnp)
                                _cache_hold(nnp);
                        if (ncp->nc_mount == mp) {
                                _cache_lock(ncp);
                                ncp = cache_zap(ncp, 0);
                                if (ncp)
                                        _cache_drop(ncp);
                        } else {
                                /* not ours, just release the hold */
                                _cache_drop(ncp);
                        }
                        ncp = nnp;
                }
                spin_unlock_wr(&nchpp->spin); XXX
        }
}
3751
3752 #endif
3753
3754 static int disablecwd;
3755 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
3756     "Disable getcwd");
3757
3758 static u_long numcwdcalls;
3759 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
3760     "Number of current directory resolution calls");
3761 static u_long numcwdfailnf;
3762 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
3763     "Number of current directory failures due to lack of file");
3764 static u_long numcwdfailsz;
3765 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
3766     "Number of current directory failures due to large result");
3767 static u_long numcwdfound;
3768 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
3769     "Number of current directory resolution successes");
3770
3771 /*
3772  * MPALMOSTSAFE
3773  */
3774 int
3775 sys___getcwd(struct __getcwd_args *uap)
3776 {
3777         u_int buflen;
3778         int error;
3779         char *buf;
3780         char *bp;
3781
3782         if (disablecwd)
3783                 return (ENODEV);
3784
3785         buflen = uap->buflen;
3786         if (buflen == 0)
3787                 return (EINVAL);
3788         if (buflen > MAXPATHLEN)
3789                 buflen = MAXPATHLEN;
3790
3791         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
3792         bp = kern_getcwd(buf, buflen, &error);
3793         if (error == 0)
3794                 error = copyout(bp, uap->buf, strlen(bp) + 1);
3795         kfree(buf, M_TEMP);
3796         return (error);
3797 }
3798
3799 char *
3800 kern_getcwd(char *buf, size_t buflen, int *error)
3801 {
3802         struct proc *p = curproc;
3803         char *bp;
3804         int i, slash_prefixed;
3805         struct filedesc *fdp;
3806         struct nchandle nch;
3807         struct namecache *ncp;
3808
3809         numcwdcalls++;
3810         bp = buf;
3811         bp += buflen - 1;
3812         *bp = '\0';
3813         fdp = p->p_fd;
3814         slash_prefixed = 0;
3815
3816         nch = fdp->fd_ncdir;
3817         ncp = nch.ncp;
3818         if (ncp)
3819                 _cache_hold(ncp);
3820
3821         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
3822                nch.mount != fdp->fd_nrdir.mount)
3823         ) {
3824                 /*
3825                  * While traversing upwards if we encounter the root
3826                  * of the current mount we have to skip to the mount point
3827                  * in the underlying filesystem.
3828                  */
3829                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
3830                         nch = nch.mount->mnt_ncmounton;
3831                         _cache_drop(ncp);
3832                         ncp = nch.ncp;
3833                         if (ncp)
3834                                 _cache_hold(ncp);
3835                         continue;
3836                 }
3837
3838                 /*
3839                  * Prepend the path segment
3840                  */
3841                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
3842                         if (bp == buf) {
3843                                 numcwdfailsz++;
3844                                 *error = ERANGE;
3845                                 bp = NULL;
3846                                 goto done;
3847                         }
3848                         *--bp = ncp->nc_name[i];
3849                 }
3850                 if (bp == buf) {
3851                         numcwdfailsz++;
3852                         *error = ERANGE;
3853                         bp = NULL;
3854                         goto done;
3855                 }
3856                 *--bp = '/';
3857                 slash_prefixed = 1;
3858
3859                 /*
3860                  * Go up a directory.  This isn't a mount point so we don't
3861                  * have to check again.
3862                  */
3863                 while ((nch.ncp = ncp->nc_parent) != NULL) {
3864                         if (ncp_shared_lock_disable)
3865                                 _cache_lock(ncp);
3866                         else
3867                                 _cache_lock_shared(ncp);
3868                         if (nch.ncp != ncp->nc_parent) {
3869                                 _cache_unlock(ncp);
3870                                 continue;
3871                         }
3872                         _cache_hold(nch.ncp);
3873                         _cache_unlock(ncp);
3874                         break;
3875                 }
3876                 _cache_drop(ncp);
3877                 ncp = nch.ncp;
3878         }
3879         if (ncp == NULL) {
3880                 numcwdfailnf++;
3881                 *error = ENOENT;
3882                 bp = NULL;
3883                 goto done;
3884         }
3885         if (!slash_prefixed) {
3886                 if (bp == buf) {
3887                         numcwdfailsz++;
3888                         *error = ERANGE;
3889                         bp = NULL;
3890                         goto done;
3891                 }
3892                 *--bp = '/';
3893         }
3894         numcwdfound++;
3895         *error = 0;
3896 done:
3897         if (ncp)
3898                 _cache_drop(ncp);
3899         return (bp);
3900 }
3901
3902 /*
3903  * Thus begins the fullpath magic.
3904  *
3905  * The passed nchp is referenced but not locked.
3906  */
3907 static int disablefullpath;
3908 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
3909     &disablefullpath, 0,
3910     "Disable fullpath lookups");
3911
3912 static u_int numfullpathcalls;
3913 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD,
3914     &numfullpathcalls, 0,
3915     "Number of full path resolutions in progress");
3916 static u_int numfullpathfailnf;
3917 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD,
3918     &numfullpathfailnf, 0,
3919     "Number of full path resolution failures due to lack of file");
3920 static u_int numfullpathfailsz;
3921 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD,
3922     &numfullpathfailsz, 0,
3923     "Number of full path resolution failures due to insufficient memory");
3924 static u_int numfullpathfound;
3925 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD,
3926     &numfullpathfound, 0,
3927     "Number of full path resolution successes");
3928
3929 int
3930 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
3931                char **retbuf, char **freebuf, int guess)
3932 {
3933         struct nchandle fd_nrdir;
3934         struct nchandle nch;
3935         struct namecache *ncp;
3936         struct mount *mp, *new_mp;
3937         char *bp, *buf;
3938         int slash_prefixed;
3939         int error = 0;
3940         int i;
3941
3942         atomic_add_int(&numfullpathcalls, -1);
3943
3944         *retbuf = NULL;
3945         *freebuf = NULL;
3946
3947         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
3948         bp = buf + MAXPATHLEN - 1;
3949         *bp = '\0';
3950         if (nchbase)
3951                 fd_nrdir = *nchbase;
3952         else if (p != NULL)
3953                 fd_nrdir = p->p_fd->fd_nrdir;
3954         else
3955                 fd_nrdir = rootnch;
3956         slash_prefixed = 0;
3957         nch = *nchp;
3958         ncp = nch.ncp;
3959         if (ncp)
3960                 _cache_hold(ncp);
3961         mp = nch.mount;
3962
3963         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
3964                 new_mp = NULL;
3965
3966                 /*
3967                  * If we are asked to guess the upwards path, we do so whenever
3968                  * we encounter an ncp marked as a mountpoint. We try to find
3969                  * the actual mountpoint by finding the mountpoint with this
3970                  * ncp.
3971                  */
3972                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
3973                         new_mp = mount_get_by_nc(ncp);
3974                 }
3975                 /*
3976                  * While traversing upwards if we encounter the root
3977                  * of the current mount we have to skip to the mount point.
3978                  */
3979                 if (ncp == mp->mnt_ncmountpt.ncp) {
3980                         new_mp = mp;
3981                 }
3982                 if (new_mp) {
3983                         nch = new_mp->mnt_ncmounton;
3984                         _cache_drop(ncp);
3985                         ncp = nch.ncp;
3986                         if (ncp)
3987                                 _cache_hold(ncp);
3988                         mp = nch.mount;
3989                         continue;
3990                 }
3991
3992                 /*
3993                  * Prepend the path segment
3994                  */
3995                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
3996                         if (bp == buf) {
3997                                 numfullpathfailsz++;
3998                                 kfree(buf, M_TEMP);
3999                                 error = ENOMEM;
4000                                 goto done;
4001                         }
4002                         *--bp = ncp->nc_name[i];
4003                 }
4004                 if (bp == buf) {
4005                         numfullpathfailsz++;
4006                         kfree(buf, M_TEMP);
4007                         error = ENOMEM;
4008                         goto done;
4009                 }
4010                 *--bp = '/';
4011                 slash_prefixed = 1;
4012
4013                 /*
4014                  * Go up a directory.  This isn't a mount point so we don't
4015                  * have to check again.
4016                  *
4017                  * We can only safely access nc_parent with ncp held locked.
4018                  */
4019                 while ((nch.ncp = ncp->nc_parent) != NULL) {
4020                         _cache_lock(ncp);
4021                         if (nch.ncp != ncp->nc_parent) {
4022                                 _cache_unlock(ncp);
4023                                 continue;
4024                         }
4025                         _cache_hold(nch.ncp);
4026                         _cache_unlock(ncp);
4027                         break;
4028                 }
4029                 _cache_drop(ncp);
4030                 ncp = nch.ncp;
4031         }
4032         if (ncp == NULL) {
4033                 numfullpathfailnf++;
4034                 kfree(buf, M_TEMP);
4035                 error = ENOENT;
4036                 goto done;
4037         }
4038
4039         if (!slash_prefixed) {
4040                 if (bp == buf) {
4041                         numfullpathfailsz++;
4042                         kfree(buf, M_TEMP);
4043                         error = ENOMEM;
4044                         goto done;
4045                 }
4046                 *--bp = '/';
4047         }
4048         numfullpathfound++;
4049         *retbuf = bp;
4050         *freebuf = buf;
4051         error = 0;
4052 done:
4053         if (ncp)
4054                 _cache_drop(ncp);
4055         return(error);
4056 }
4057
4058 int
4059 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
4060             char **freebuf, int guess)
4061 {
4062         struct namecache *ncp;
4063         struct nchandle nch;
4064         int error;
4065
4066         *freebuf = NULL;
4067         atomic_add_int(&numfullpathcalls, 1);
4068         if (disablefullpath)
4069                 return (ENODEV);
4070
4071         if (p == NULL)
4072                 return (EINVAL);
4073
4074         /* vn is NULL, client wants us to use p->p_textvp */
4075         if (vn == NULL) {
4076                 if ((vn = p->p_textvp) == NULL)
4077                         return (EINVAL);
4078         }
4079         spin_lock_shared(&vn->v_spin);
4080         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
4081                 if (ncp->nc_nlen)
4082                         break;
4083         }
4084         if (ncp == NULL) {
4085                 spin_unlock_shared(&vn->v_spin);
4086                 return (EINVAL);
4087         }
4088         _cache_hold(ncp);
4089         spin_unlock_shared(&vn->v_spin);
4090
4091         atomic_add_int(&numfullpathcalls, -1);
4092         nch.ncp = ncp;
4093         nch.mount = vn->v_mount;
4094         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
4095         _cache_drop(ncp);
4096         return (error);
4097 }