sys/kern/vfs_lock.c
/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */
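/*
 * A minimal usage sketch tying the table above to the entry points in
 * this file (illustrative only; the caller shown is hypothetical and
 * error handling is elided):
 *
 *	if (vget(vp, LK_SHARED) == 0) {	/- INACTIVE|CACHED -> ACTIVE -/
 *		...			/- vp is referenced and locked -/
 *		vput(vp);		/- unlock + vrele() -/
 *	}
 *
 * Deactivation back to INACTIVE happens on the final vrele() when
 * VREF_FINALIZE is set; the CACHED and DYING transitions are driven
 * through vx_lock()/vx_get() by the reclamation code below.
 */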
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)
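/*
 * For example (a sketch; the configuration is hypothetical): with
 * ncpus == 8, the hash scrambles the vnode pointer with VLIST_XOR,
 * folds it over the large prime, and reduces it modulo ncpus, picking
 * one of the eight per-cpu vnode_index structures:
 *
 *	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
 */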
TAILQ_HEAD(freelst, vnode);

struct vnode_index {
	struct freelst	active_list;
	struct vnode	active_rover;
	struct freelst	inactive_list;
	struct spinlock	spin;
	int	deac_rover;
	int	free_rover;
} __cachealign;

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	   &activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	   &cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	   &inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	   &batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	     &trackvnode, 0, "");
#endif
/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}
/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}
/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}
/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}
static __inline
void
_vinactive_tail(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}
/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}
void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}
/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}
/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}
/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
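/*
 * Usage sketch (the structure and field shown are hypothetical): a
 * subsystem caching a vnode pointer in its own data structure wraps
 * the pointer's lifetime with vhold()/vdrop(), so the vnode may still
 * be deactivated or reclaimed but can never be kfree()'d out from
 * under it:
 *
 *	vhold(vp);
 *	mystruct->ms_vp = vp;
 *	...
 *	mystruct->ms_vp = NULL;
 *	vdrop(vp);
 */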
/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}
/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}
/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}
#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif
/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}
/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
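/*
 * Sketch contrasting the two acquisition paths (hypothetical callers,
 * error handling elided).  vget()/vput() reactivate the vnode for
 * normal use; vx_get()/vx_put() do not, and are reserved for
 * deactivation and reclamation related code:
 *
 *	if (vget(vp, LK_EXCLUSIVE) == 0) {	/- reactivates -/
 *		...
 *		vput(vp);
 *	}
 *
 *	vx_get(vp);				/- does not reactivate -/
 *	...
 *	vx_put(vp);
 */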
/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}
/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				TAILQ_REMOVE(&vi->inactive_list,
					     vp, v_list);
				TAILQ_INSERT_TAIL(&vi->inactive_list,
						  vp, v_list);
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}
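/*
 * Consumer sketch (hypothetical caller; filesystems normally reach this
 * code via higher level vnode allocation paths rather than calling it
 * directly): the vnode returned by allocvnode() is VX locked and
 * referenced.  A caller finishes initializing it, while an unwanted
 * vnode (still VNON) is disposed of with vx_put(), which flags it for
 * finalization and routes it back to the correct queue:
 *
 *	vp = allocvnode(0, 0);
 *	...				/- caller-specific setup -/
 *	if (setup_failed)		/- hypothetical condition -/
 *		vx_put(vp);		/- VNON vnode gets finalized -/
 */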
/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    point).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}