/* sys/kern/vfs_lock.c */

/*
 * Copyright (c) 2004,2013-2022 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *      INACTIVE -> CACHED|DYING        vx_lock(excl) + vi->spin
 *      DYING    -> CACHED              vx_lock(excl)
 *      ACTIVE   -> INACTIVE            (none)       + v_spin + vi->spin
 *      INACTIVE -> ACTIVE              vn_lock(any) + v_spin + vi->spin
 *      CACHED   -> ACTIVE              vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *       Switching into ACTIVE also requires a vref and vnode lock, however
 *       the vnode lock is allowed to be SHARED.
 *
 *       Switching into a CACHED or DYING state requires an exclusive vnode
 *       lock or vx_lock (which is almost the same thing but not quite).
 */
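
/*
 * Illustrative sketch (not part of the original source): per the transition
 * table above, reactivating a CACHED or INACTIVE vnode only needs
 * vn_lock(any), so a shared lock through vget() is sufficient.  The caller
 * is assumed to already hold something (e.g. a namecache reference) that
 * keeps the vnode from being destroyed.  The function name is hypothetical.
 */
#if 0
static int
example_reactivate(struct vnode *vp)
{
        int error;

        /* vget() refs the vnode and reactivates it if needed */
        error = vget(vp, LK_SHARED);
        if (error)
                return(error);          /* e.g. ENOENT if being reclaimed */

        /* ... use the now-ACTIVE, locked vnode ... */

        vput(vp);                       /* unlock + vrele */
        return(0);
}
#endif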

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#define VACT_MAX        10
#define VACT_INC        2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE_OBJ(M_VNODE, sizeof(struct vnode), "vnodes", "vnodes");
static MALLOC_DEFINE(M_VNODE_HASH, "vnodelsthash", "vnode list hash");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2    123462047LU
#define VLIST_XOR       (uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)  (((uintptr_t)vp ^ VLIST_XOR) % \
                         VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
        &activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
        &cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
        &inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
        &batchfreevnodes, 0, "Number of vnodes to free at once");

static long auxrecovervnodes1;
SYSCTL_INT(_debug, OID_AUTO, auxrecovervnodes1, CTLFLAG_RW,
        &auxrecovervnodes1, 0, "vnlru auxiliary vnodes recovered");
static long auxrecovervnodes2;
SYSCTL_INT(_debug, OID_AUTO, auxrecovervnodes2, CTLFLAG_RW,
        &auxrecovervnodes2, 0, "vnlru auxiliary vnodes recovered");

#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
        &trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
        int i;

        kmalloc_obj_raise_limit(M_VNODE, 0);    /* unlimited */
        vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
                                  M_VNODE_HASH, M_ZERO | M_WAITOK);
        for (i = 0; i < ncpus; ++i) {
                struct vnode_index *vi = &vnode_list_hash[i];

                TAILQ_INIT(&vi->inactive_list);
                TAILQ_INIT(&vi->active_list);
                TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
                spin_init(&vi->spin, "vfslock");
        }
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
        atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
        atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
        _vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
        _vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
        struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
        if ((u_long)vp == trackvnode)
                kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
        spin_lock(&vi->spin);

        switch(vp->v_state) {
        case VS_ACTIVE:
                spin_unlock(&vi->spin);
                panic("_vactivate: already active");
                /* NOT REACHED */
                return;
        case VS_INACTIVE:
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                break;
        case VS_CACHED:
        case VS_DYING:
                break;
        }
        TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
        vp->v_state = VS_ACTIVE;
        spin_unlock(&vi->spin);
        atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
        struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
        if ((u_long)vp == trackvnode) {
                kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
                print_backtrace(-1);
        }
#endif
        spin_lock(&vi->spin);

        /*
         * Remove from active list if it is sitting on it
         */
        switch(vp->v_state) {
        case VS_ACTIVE:
                TAILQ_REMOVE(&vi->active_list, vp, v_list);
                atomic_add_int(&mycpu->gd_activevnodes, -1);
                break;
        case VS_INACTIVE:
                spin_unlock(&vi->spin);
                panic("_vinactive: already inactive");
                /* NOT REACHED */
                return;
        case VS_CACHED:
        case VS_DYING:
                break;
        }

        /*
         * Distinguish between basically dead vnodes, vnodes with cached
         * data, and vnodes without cached data.  A rover will shift the
         * vnodes around as their cache status is lost.
         */
        if (vp->v_flag & VRECLAIMED) {
                TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
        } else {
                TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
        }
        vp->v_state = VS_INACTIVE;
        spin_unlock(&vi->spin);
        atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
        KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
                ("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
        atomic_add_int(&vp->v_refcnt, 1);
}

void
vref_special(struct vnode *vp)
{
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
}

void
synchronizevnodecount(void)
{
        int nca = 0;
        int act = 0;
        int ina = 0;
        int i;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                nca += gd->gd_cachedvnodes;
                act += gd->gd_activevnodes;
                ina += gd->gd_inactivevnodes;
        }
        cachedvnodes = nca;
        activevnodes = act;
        inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
        int i;
        int n = 0;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                n += gd->gd_cachedvnodes;
        }
        return n;
}

int
countcachedandinactivevnodes(void)
{
        int i;
        int n = 0;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
        }
        return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
        int count;

#if 1
        count = vp->v_refcnt;
        cpu_ccfence();

        for (;;) {
                KKASSERT((count & VREF_MASK) > 0);
                KKASSERT(vp->v_state == VS_ACTIVE ||
                         vp->v_state == VS_INACTIVE);

                /*
                 * 2+ case
                 */
                if ((count & VREF_MASK) > 1) {
                        if (atomic_fcmpset_int(&vp->v_refcnt,
                                               &count, count - 1)) {
                                break;
                        }
                        continue;
                }

                /*
                 * 1->0 transition case must handle possible finalization.
                 * When finalizing we transition 1->0x40000000.  Note that
                 * cachedvnodes is only adjusted on transitions to ->0.
                 *
                 * WARNING! VREF_TERMINATE can be cleared at any point
                 *          when the refcnt is non-zero (by vget()) and
                 *          the vnode has not been reclaimed.  Thus
                 *          transitions out of VREF_TERMINATE do not have
                 *          to mess with cachedvnodes.
                 */
                if (count & VREF_FINALIZE) {
                        vx_lock(vp);
                        if (atomic_fcmpset_int(&vp->v_refcnt,
                                               &count, VREF_TERMINATE)) {
                                vnode_terminate(vp);
                                break;
                        }
                        vx_unlock(vp);
                } else {
                        if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
                                atomic_add_int(&mycpu->gd_cachedvnodes, 1);
                                break;
                        }
                }
                cpu_pause();
                /* retry */
        }
#else
        /*
         * XXX NOT YET WORKING!  Multiple threads can reference the vnode
         * after dropping their count, racing destruction, because this
         * code is not directly transitioning from 1->VREF_FINALIZE.
         */

        /*
         * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
         * and attempt to acquire VREF_TERMINATE if set.  It is possible for
         * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
         * only one will be able to transition the vnode into the
         * VREF_TERMINATE state.
         *
         * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
         *       this state once.
         */
        count = atomic_fetchadd_int(&vp->v_refcnt, -1);
        if ((count & VREF_MASK) == 1) {
                atomic_add_int(&mycpu->gd_cachedvnodes, 1);
                --count;
                while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
                        vx_lock(vp);
                        if (atomic_fcmpset_int(&vp->v_refcnt,
                                               &count, VREF_TERMINATE)) {
                                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
                                vnode_terminate(vp);
                                break;
                        }
                        vx_unlock(vp);
                }
        }
#endif
}
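
/*
 * Illustrative sketch (not part of the original source): a vref()/vrele()
 * pair bracketing work on a vnode that is already known to be referenced
 * and active, for example a vnode held open by a file descriptor.  The
 * function name is hypothetical.
 */
#if 0
static void
example_ref_and_release(struct vnode *vp)
{
        vref(vp);               /* vp already active: cheap atomic add */

        /* ... hand vp to code that expects to own a reference ... */

        vrele(vp);              /* deactivates on 1->0 only if VREF_FINALIZE */
}
#endif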

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *           already be held by the caller.  vdrop() will clean up the
 *           free list state.
 */
void
vhold(struct vnode *vp)
{
        atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
        atomic_add_int(&vp->v_auxrefs, -1);
}
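
/*
 * Illustrative sketch (not part of the original source): an auxiliary
 * structure pinning the vnode's memory with vhold()/vdrop().  The vnode can
 * still be deactivated or reclaimed while held; the hold only prevents the
 * final kfree().  The structure and function names are hypothetical.
 */
#if 0
struct example_track {
        struct vnode *t_vp;
};

static void
example_track_attach(struct example_track *t, struct vnode *vp)
{
        vhold(vp);              /* pin memory, does not block reclaim */
        t->t_vp = vp;
}

static void
example_track_detach(struct example_track *t)
{
        vdrop(t->t_vp);
        t->t_vp = NULL;
}
#endif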

/*
 * Set VREF_FINALIZE to request that the vnode be inactivated
 * as soon as possible (on the 1->0 transition of its refs).
 *
 * Caller must have a ref on the vnode.
 *
 * This function has no effect if the vnode is already in termination
 * processing.
 */
void
vfinalize(struct vnode *vp)
{
        if ((vp->v_refcnt & VREF_MASK) > 0)
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
}

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *       or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *       previously be active).  We lose control of the vnode the instant
 *       it is placed on the free list.
 *
 *       The VX lock is required when transitioning to VS_CACHED but is
 *       not sufficient for the vshouldfree() interlocked test or when
 *       transitioning away from VS_CACHED.  v_spin is also required for
 *       those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
        KKASSERT(vp->v_state == VS_ACTIVE);

        if ((vp->v_flag & VINACTIVE) == 0) {
                _vsetflags(vp, VINACTIVE);
                if (vp->v_mount)
                        VOP_INACTIVE(vp);
        }
        spin_lock(&vp->v_spin);
        _vinactive(vp);
        spin_unlock(&vp->v_spin);

        vx_unlock(vp);
}

/****************************************************************
 *                      VX LOCKING FUNCTIONS                    *
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
        lockmgr(&vp->v_lock, LK_EXCLUSIVE);
        spin_lock_update_only(&vp->v_spin);
}

void
vx_unlock(struct vnode *vp)
{
        spin_unlock_update_only(&vp->v_spin);
        lockmgr(&vp->v_lock, LK_RELEASE);
}

/*
 * Downgrades a VX lock to a normal VN lock.  The lock remains EXCLUSIVE.
 *
 * Generally required after calling getnewvnode() if the intention is
 * to return a normal locked vnode to the caller.
 */
void
vx_downgrade(struct vnode *vp)
{
        spin_unlock_update_only(&vp->v_spin);
}

/****************************************************************
 *                 VNODE ACQUISITION FUNCTIONS                  *
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
        int error;

        /*
         * A lock type must be passed
         */
        if ((flags & LK_TYPE_MASK) == 0) {
                panic("vget() called with no lock specified!");
                /* NOT REACHED */
        }

        /*
         * Reference the structure and then acquire the lock.
         *
         * NOTE: The requested lock might be a shared lock and does
         *       not protect our access to the refcnt or other fields.
         */
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);

        if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
                /*
                 * The lock failed, undo and return an error.  This will not
                 * normally trigger a termination.
                 */
                vrele(vp);
        } else if (vp->v_flag & VRECLAIMED) {
                /*
                 * The node is being reclaimed and cannot be reactivated
                 * any more, undo and return ENOENT.
                 */
                vn_unlock(vp);
                vrele(vp);
                error = ENOENT;
        } else if (vp->v_state == VS_ACTIVE) {
                /*
                 * A VS_ACTIVE vnode coupled with the fact that we have
                 * a vnode lock (even if shared) prevents v_state from
                 * changing.  Since the vnode is not in a VRECLAIMED state,
                 * we can safely clear VINACTIVE.
                 *
                 * It is possible for a shared lock to cause a race with
                 * another thread that is also in the process of clearing
                 * VREF_TERMINATE, meaning that we might return with it still
                 * set and then assert in a later vref().  The solution is to
                 * unconditionally clear VREF_TERMINATE here as well.
                 *
                 * NOTE! Multiple threads may clear VINACTIVE if this is a
                 *       shared lock.  This race is allowed.
                 */
                if (vp->v_flag & VINACTIVE)
                        _vclrflags(vp, VINACTIVE);      /* SMP race ok */
                if (vp->v_act < VACT_MAX) {
                        vp->v_act += VACT_INC;
                        if (vp->v_act > VACT_MAX)       /* SMP race ok */
                                vp->v_act = VACT_MAX;
                }
                error = 0;
                if (vp->v_refcnt & VREF_TERMINATE)      /* SMP race ok */
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
        } else {
                /*
                 * If the vnode is not VS_ACTIVE it must be reactivated
                 * in addition to clearing VINACTIVE.  An exclusive spin_lock
                 * is needed to manipulate the vnode's list.
                 *
                 * Because the lockmgr lock might be shared, we might race
                 * another reactivation, which we handle.  In this situation,
                 * however, the refcnt prevents other v_state races.
                 *
                 * As with above, clearing VINACTIVE is allowed to race other
                 * clearings of VINACTIVE.
                 *
                 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
                 * the refcnt is non-zero and the vnode has not been
                 * reclaimed.  This also means that the transitions do
                 * not affect cachedvnodes.
                 *
                 * It is possible for a shared lock to cause a race with
                 * another thread that is also in the process of clearing
                 * VREF_TERMINATE, meaning that we might return with it still
                 * set and then assert in a later vref().  The solution is to
                 * unconditionally clear VREF_TERMINATE here as well.
                 */
                _vclrflags(vp, VINACTIVE);
                vp->v_act += VACT_INC;
                if (vp->v_act > VACT_MAX)       /* SMP race ok */
                        vp->v_act = VACT_MAX;
                spin_lock(&vp->v_spin);

                switch(vp->v_state) {
                case VS_INACTIVE:
                        _vactivate(vp);
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
                                                        VREF_FINALIZE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_CACHED:
                        _vactivate(vp);
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
                                                        VREF_FINALIZE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_ACTIVE:
                        atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
                                                        VREF_TERMINATE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_DYING:
                        spin_unlock(&vp->v_spin);
                        panic("Impossible VS_DYING state");
                        break;
                }
                error = 0;
        }
        return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
        kprintf("vput(%p) %s:%d\n", vp, filename, line);
        vn_unlock(vp);
        vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
        vn_unlock(vp);
        vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
        lockmgr(&vp->v_lock, LK_EXCLUSIVE);
        spin_lock_update_only(&vp->v_spin);
}

int
vx_get_nonblock(struct vnode *vp)
{
        int error;

        if (lockinuse(&vp->v_lock))
                return(EBUSY);
        error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
        if (error == 0) {
                spin_lock_update_only(&vp->v_spin);
                if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                        atomic_add_int(&mycpu->gd_cachedvnodes, -1);
        }
        return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
        if (vp->v_type == VNON || vp->v_type == VBAD)
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
        spin_unlock_update_only(&vp->v_spin);
        lockmgr(&vp->v_lock, LK_RELEASE);
        vrele(vp);
}
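
/*
 * Illustrative sketch (not part of the original source): vx_get()/vx_put()
 * used by deactivation/reclamation-related code that must lock the vnode
 * without reactivating it, in contrast to vget()/vput().  The caller is
 * assumed to already hold something that keeps vp from being destroyed.
 * The function name is hypothetical.
 */
#if 0
static void
example_scan_vnode(struct vnode *vp)
{
        vx_get(vp);             /* exclusive lock + ref, no reactivation */

        /* ... inspect or tear down state without changing v_state ... */

        vx_put(vp);             /* unlock; vrele() handles transitions */
}
#endif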

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 *       The returned vnode will be VX locked.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
        struct vnode_index *vi;
        struct vnode *vp;
        int count;
        int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
        int ri;
        int cpu_count;
        int cachedvnodes;

        /*
         * Try to deactivate some vnodes cached on the active list.  We
         * generally want a 50-50 balance active vs inactive.
         */
        cachedvnodes = countcachedvnodes();
        if (cachedvnodes < inactivevnodes)
                goto skip;

        ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

        for (count = 0; count < maxcount * 2; ++count, ++ri) {
                vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

                spin_lock(&vi->spin);

                vp = TAILQ_NEXT(&vi->active_rover, v_list);
                TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
                if (vp == NULL) {
                        TAILQ_INSERT_HEAD(&vi->active_list,
                                          &vi->active_rover, v_list);
                } else {
                        TAILQ_INSERT_AFTER(&vi->active_list, vp,
                                           &vi->active_rover, v_list);
                }
                if (vp == NULL) {
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * Don't try to deactivate if someone has the vp referenced.
                 */
                if ((vp->v_refcnt & VREF_MASK) != 0) {
                        spin_unlock(&vi->spin);
                        vp->v_act += VACT_INC;
                        if (vp->v_act > VACT_MAX)       /* SMP race ok */
                                vp->v_act = VACT_MAX;
                        continue;
                }

                /*
                 * Calculate the deactivation weight.  Reduce v_act less
                 * if the vnode's object has a lot of VM pages.
                 *
                 * XXX obj race
                 */
                if (vp->v_act > 0) {
                        vm_object_t obj;

                        if ((obj = vp->v_object) != NULL &&
                            obj->resident_page_count >= trigger)
                        {
                                vp->v_act -= 1;
                        } else {
                                vp->v_act -= VACT_INC;
                        }
                        if (vp->v_act < 0)
                                vp->v_act = 0;
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * If v_auxrefs is not the expected value the vnode might
                 * reside in the namecache topology on an internal node and
                 * not at a leaf.  v_auxrefs can be wrong for other reasons,
                 * but this is the most likely.
                 *
                 * Such vnodes will not be recycled by vnlru later on in
                 * its inactive scan, so try to make the vnode presentable
                 * and only move it to the inactive queue if we can.
                 *
                 * On success, the vnode is disconnected from the namecache
                 * topology entirely, making vnodes above it in the topology
                 * recyclable.  This will allow the active scan to continue
                 * to make progress in balancing the active and inactive
                 * lists.
                 */
                if (vp->v_auxrefs != vp->v_namecache_count) {
                        if (vx_get_nonblock(vp) == 0) {
                                spin_unlock(&vi->spin);
                                if ((vp->v_refcnt & VREF_MASK) == 1)
                                        cache_inval_vp_quick(vp);
                                if (vp->v_auxrefs == vp->v_namecache_count)
                                        ++auxrecovervnodes1;
                                vx_put(vp);
                        } else {
                                spin_unlock(&vi->spin);
                        }
                        continue;
                }

                /*
                 * Try to deactivate the vnode.  It is ok if v_auxrefs
                 * races every once in a while, we just don't want an
                 * excess of unreclaimable vnodes on the inactive list.
                 */
                if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                        atomic_add_int(&mycpu->gd_cachedvnodes, -1);
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

                spin_unlock(&vi->spin);
                vrele(vp);
        }

        vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
        /*
         * Loop trying to lock the first vnode on the free list.
         * Cycle if we can't.
         */
        cpu_count = ncpus;
        ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

        for (count = 0; count < maxcount; ++count, ++ri) {
                vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

                spin_lock(&vi->spin);

                vp = TAILQ_FIRST(&vi->inactive_list);
                if (vp == NULL) {
                        spin_unlock(&vi->spin);
                        if (--cpu_count == 0)
                                break;
                        ri = (ri + 16) & ~15;
                        --ri;
                        continue;
                }

                /*
                 * non-blocking vx_get will also ref the vnode on success.
                 */
                if (vx_get_nonblock(vp)) {
                        KKASSERT(vp->v_state == VS_INACTIVE);
                        TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                        TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * Because we are holding vfs_spin the vnode should currently
                 * be inactive and VREF_TERMINATE should still be set.
                 *
                 * Once vfs_spin is released the vnode's state should remain
                 * unmodified due to both the lock and ref on it.
                 */
                KKASSERT(vp->v_state == VS_INACTIVE);
                spin_unlock(&vi->spin);
#ifdef TRACKVNODE
                if ((u_long)vp == trackvnode)
                        kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

                /*
                 * The active scan already did this, but some leakage can
                 * happen.  Don't let an easily recyclable vnode go to
                 * waste!
                 */
                if (vp->v_auxrefs != vp->v_namecache_count &&
                    (vp->v_refcnt & ~VREF_FINALIZE) == VREF_TERMINATE + 1)
                {
                        cache_inval_vp_quick(vp);
                        if (vp->v_auxrefs == vp->v_namecache_count)
                                ++auxrecovervnodes2;
                }

                /*
                 * Do not reclaim/reuse a vnode while auxiliary refs exist.
                 * This includes namecache refs due to a related ncp being
                 * locked or having children, a VM object association, or
                 * other hold users.
                 *
                 * Do not reclaim/reuse a vnode if someone else has a real
                 * ref on it.  This can occur if a filesystem temporarily
                 * releases the vnode lock during VOP_RECLAIM.
                 */
                if (vp->v_auxrefs != vp->v_namecache_count ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
                        if (vp->v_state == VS_INACTIVE) {
                                spin_lock(&vi->spin);
                                if (vp->v_state == VS_INACTIVE) {
                                        TAILQ_REMOVE(&vi->inactive_list,
                                                     vp, v_list);
                                        TAILQ_INSERT_TAIL(&vi->inactive_list,
                                                          vp, v_list);
                                }
                                spin_unlock(&vi->spin);
                        }
                        vx_put(vp);
                        continue;
                }

                /*
                 * VINACTIVE and VREF_TERMINATE are expected to both be set
                 * for vnodes pulled from the inactive list, and cannot be
                 * changed while we hold the vx lock.
                 *
                 * Try to reclaim the vnode.
                 *
                 * The cache_inval_vp() can fail if any of the namecache
                 * elements are actively locked, preventing the vnode from
                 * being reclaimed.  This is the desired operation as it
                 * gives the namecache code certain guarantees just by
                 * holding a ncp.
                 */
                KKASSERT(vp->v_flag & VINACTIVE);
                KKASSERT(vp->v_refcnt & VREF_TERMINATE);

                if ((vp->v_flag & VRECLAIMED) == 0) {
                        if (cache_inval_vp_nonblock(vp))
                                goto failed;
                        vgone_vxlocked(vp);
                        /* vnode is still VX locked */
                }

                /*
                 * At this point if there are no other refs or auxrefs on
                 * the vnode with the inactive list locked, and we remove
                 * the vnode from the inactive list, it should not be
                 * possible for anyone else to access the vnode any more.
                 *
                 * Since the vnode is in a VRECLAIMED state, no new
                 * namecache associations could have been made and the
                 * vnode should have already been removed from its mountlist.
                 *
                 * Since we hold a VX lock on the vnode it cannot have been
                 * reactivated (moved out of the inactive list).
                 */
                KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
                spin_lock(&vi->spin);
                if (vp->v_auxrefs ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
                        spin_unlock(&vi->spin);
                        goto failed;
                }
                KKASSERT(vp->v_state == VS_INACTIVE);
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                vp->v_state = VS_DYING;
                spin_unlock(&vi->spin);

                /*
                 * Nothing should have been able to access this vp.  Only
                 * our ref should remain now.
                 */
                atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
                KASSERT(vp->v_refcnt == 1,
                        ("vp %p badrefs %08x", vp, vp->v_refcnt));

                /*
                 * Return a VX locked vnode suitable for reuse.
                 */
                vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
                return(vp);
        }
        vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
        return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
        struct vnode *vp;
        struct vnode_index *vi;

        /*
         * lktimeout only applies when LK_TIMELOCK is used, and only
         * the pageout daemon uses it.  The timeout may not be zero
         * or the pageout daemon can deadlock in low-VM situations.
         */
        if (lktimeout == 0)
                lktimeout = hz / 10;

        /*
         * Do not flag for synchronous recyclement unless there are enough
         * freeable vnodes to recycle and the number of vnodes has
         * significantly exceeded our target.  We want the normal vnlru
         * process to handle the cleaning (at 9/10's) before we are forced
         * to flag it here at 11/10's for userexit path processing.
         */
        if (numvnodes >= maxvnodes * 11 / 10 &&
            cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
                struct thread *td = curthread;
                if (td->td_lwp)
                        atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
        }

        /*
         * Try to trivially reuse a reclaimed vnode from the head of the
         * inactive list for this cpu.  Any vnode cycling which occurs
         * which terminates the vnode will cause it to be returned to the
         * same pcpu structure (e.g. unlink calls).
         */
        vi = &vnode_list_hash[mycpuid];
        spin_lock(&vi->spin);

        vp = TAILQ_FIRST(&vi->inactive_list);
        if (vp && (vp->v_flag & VRECLAIMED)) {
                /*
                 * non-blocking vx_get will also ref the vnode on success.
                 */
                if (vx_get_nonblock(vp)) {
                        KKASSERT(vp->v_state == VS_INACTIVE);
                        TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                        TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
                        spin_unlock(&vi->spin);
                        goto slower;
                }

                /*
                 * Because we are holding vfs_spin the vnode should currently
                 * be inactive and VREF_TERMINATE should still be set.
                 *
                 * Once vfs_spin is released the vnode's state should remain
                 * unmodified due to both the lock and ref on it.
                 */
                KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
                if ((u_long)vp == trackvnode)
                        kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

                /*
                 * Do not reclaim/reuse a vnode while auxiliary refs exist.
                 * This includes namecache refs due to a related ncp being
                 * locked or having children, a VM object association, or
                 * other hold users.
                 *
                 * Do not reclaim/reuse a vnode if someone else has a real
                 * ref on it.  This can occur if a filesystem temporarily
                 * releases the vnode lock during VOP_RECLAIM.
                 */
                if (vp->v_auxrefs ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
                        if (vp->v_state == VS_INACTIVE) {
                                TAILQ_REMOVE(&vi->inactive_list,
                                             vp, v_list);
                                TAILQ_INSERT_TAIL(&vi->inactive_list,
                                                  vp, v_list);
                        }
                        spin_unlock(&vi->spin);
                        vx_put(vp);
                        goto slower;
                }

                /*
                 * VINACTIVE and VREF_TERMINATE are expected to both be set
                 * for vnodes pulled from the inactive list, and cannot be
                 * changed while we hold the vx lock.
                 *
                 * Try to reclaim the vnode.
                 */
                KKASSERT(vp->v_flag & VINACTIVE);
                KKASSERT(vp->v_refcnt & VREF_TERMINATE);

                if ((vp->v_flag & VRECLAIMED) == 0) {
                        spin_unlock(&vi->spin);
                        vx_put(vp);
                        goto slower;
                }

                /*
                 * At this point if there are no other refs or auxrefs on
                 * the vnode with the inactive list locked, and we remove
                 * the vnode from the inactive list, it should not be
                 * possible for anyone else to access the vnode any more.
                 *
                 * Since the vnode is in a VRECLAIMED state, no new
                 * namecache associations could have been made and the
                 * vnode should have already been removed from its mountlist.
                 *
                 * Since we hold a VX lock on the vnode it cannot have been
                 * reactivated (moved out of the inactive list).
                 */
                KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
                KKASSERT(vp->v_state == VS_INACTIVE);
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                vp->v_state = VS_DYING;
                spin_unlock(&vi->spin);

                /*
                 * Nothing should have been able to access this vp.  Only
                 * our ref should remain now.
                 *
                 * At this point we can kfree() the vnode if we want to.
                 * Instead, we reuse it for the allocation.
                 */
                atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
                KASSERT(vp->v_refcnt == 1,
                        ("vp %p badrefs %08x", vp, vp->v_refcnt));
                vx_unlock(vp);          /* safety: keep the API clean */
                bzero(vp, sizeof(*vp));
        } else {
                spin_unlock(&vi->spin);
slower:
                vp = kmalloc_obj(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
                atomic_add_int(&numvnodes, 1);
        }

        lwkt_token_init(&vp->v_token, "vnode");
        lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
        TAILQ_INIT(&vp->v_namecache);
        RB_INIT(&vp->v_rbclean_tree);
        RB_INIT(&vp->v_rbdirty_tree);
        RB_INIT(&vp->v_rbhash_tree);
        spin_init(&vp->v_spin, "allocvnode");

        vx_lock(vp);
        vp->v_refcnt = 1;
        vp->v_flag = VAGE0 | VAGE1;
        vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

        KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
        /* exclusive lock still held */

        vp->v_filesize = NOOFFSET;
        vp->v_type = VNON;
        vp->v_tag = 0;
        vp->v_state = VS_CACHED;
        _vactivate(vp);

        return (vp);
}
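
/*
 * Illustrative sketch (not part of the original source): what a consumer of
 * allocvnode() might do with the VX locked, vref'd vnode it receives, based
 * on the comments above and on vx_downgrade().  Real filesystems normally go
 * through higher level helpers; the inode type, field choices, and function
 * name here are hypothetical.
 */
#if 0
struct example_inode;                   /* hypothetical fs-private inode */

static struct vnode *
example_new_vnode(struct example_inode *ip)
{
        struct vnode *vp;

        vp = allocvnode(0, 0);          /* returned VX locked + vref'd */
        vp->v_type = VREG;              /* hypothetical: regular file */
        vp->v_data = ip;                /* attach fs-private data */
        vx_downgrade(vp);               /* hand back a normally locked vnode */
        return(vp);
}
#endif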

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *          present under directory vnodes in the namecache.  For the
 *          moment use an if() instead of a while() and note that if
 *          we were to use a while() we would still have to break out
 *          if freesomevnodes() returned 0.  vnlru will also be trying
 *          hard to free vnodes at the same time (with a lower trigger
 *          pointer).
 */
void
allocvnode_gc(void)
{
        if (numvnodes >= maxvnodes &&
            countcachedandinactivevnodes() >= maxvnodes * 5 / 10)
        {
                freesomevnodes(batchfreevnodes);
        }
}

int
freesomevnodes(int n)
{
        struct vnode *vp;
        int count = 0;

        while (n) {
                if ((vp = cleanfreevnode(n)) == NULL)
                        break;
                vx_unlock(vp);
                --n;
                ++count;
                kfree_obj(vp, M_VNODE);
                atomic_add_int(&numvnodes, -1);
        }
        return(count);
}