/* sys/kern/vfs_lock.c */

/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)	      + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any)  + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any)  + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

TAILQ_HEAD(freelst, vnode);

struct vnode_index {
	struct freelst	active_list;
	struct vnode	active_rover;
	struct freelst	inactive_list;
	struct spinlock	spin;
	int	deac_rover;
	int	free_rover;
} __cachealign;

static struct vnode_index *vnode_list_hash;

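/*
 * Illustrative sketch (not part of the original file): how the hash above
 * selects a per-cpu bucket.  "example_vnode_bucket" is a hypothetical
 * helper; the real code simply indexes vnode_list_hash with VLIST_HASH().
 */
#if 0
static __inline struct vnode_index *
example_vnode_bucket(struct vnode *vp)
{
	/*
	 * VLIST_XOR decorrelates the pointer bits, VLIST_PRIME2 mixes them,
	 * and the final modulus picks one of the ncpus hash buckets.
	 */
	return (&vnode_list_hash[VLIST_HASH(vp)]);
}
#endif
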
int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&inactivevnodes, 1);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

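/*
 * Illustrative sketch (not part of the original file): the usual
 * vref()/vrele() pairing on a vnode already known to be active, e.g. one
 * borrowed from a structure holding its own reference.
 * "example_borrow_active" is a hypothetical helper.
 */
#if 0
static void
example_borrow_active(struct vnode *vp)
{
	vref(vp);	/* vp must not be VS_INACTIVE; use vget() for that */
	/* ... use vp; no vnode lock is acquired by vref() ... */
	vrele(vp);	/* lockless 1->0 unless VREF_FINALIZE is set */
}
#endif
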
/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path, particularly
 * not updating the global.  Each cpu tracks its own accumulator.
 * The individual accumulators are not accurate and must be summed
 * together.
 */
int
countcachedvnodes(int gupdate)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	if (gupdate)
		cachedvnodes = n;
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}

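/*
 * Illustrative sketch (not part of the original file): an auxiliary
 * reference only prevents the vnode memory from being kfree()'d; it does
 * not block deactivation or reclamation.  "example_hold_across_blocking"
 * is a hypothetical helper.
 */
#if 0
static void
example_hold_across_blocking(struct vnode *vp)
{
	vhold(vp);	/* safe with or without v_spin held */
	/* ... potentially blocking work that does not need vp locked ... */
	vdrop(vp);
}
#endif
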
/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
		/* might deactivate page */
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

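/*
 * Illustrative sketch (not part of the original file): vx_lock()/vx_unlock()
 * are just the vnode's lockmgr lock taken exclusively, without the
 * reactivation side effects of vn_lock().  "example_vx_section" is a
 * hypothetical helper.
 */
#if 0
static void
example_vx_section(struct vnode *vp)
{
	/* the caller must already hold a ref or auxref on vp */
	vx_lock(vp);
	/* ... deactivation or reclamation related work ... */
	vx_unlock(vp);
}
#endif
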
/****************************************************************
 *		VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in an SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

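/*
 * Illustrative sketch (not part of the original file): the common
 * vget()/vput() pattern for a caller holding only an auxiliary reference
 * (e.g. via the namecache) that wants the vnode locked and reactivated.
 * "example_lock_and_use" is a hypothetical helper.
 */
#if 0
static int
example_lock_and_use(struct vnode *vp)
{
	int error;

	error = vget(vp, LK_SHARED);	/* a lock type is mandatory */
	if (error)
		return (error);		/* e.g. ENOENT if vp was reclaimed */
	/* vp is referenced, locked, and VS_ACTIVE here */
	vput(vp);			/* vn_unlock() + vrele() */
	return (0);
}
#endif
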
/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes(0) < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * Decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We always allocate the vnode.  Attempting to recycle existing vnodes
 * here can lead to numerous deadlocks, particularly with softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	atomic_add_int(&numvnodes, 1);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}

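/*
 * Illustrative sketch (not part of the original file): how a filesystem
 * might consume allocvnode() and dispose of a vnode it ends up not wanting,
 * matching the vx_put() comment above.  "example_getnewvnode" and its
 * setup_error parameter are hypothetical, as are the lktimeout/lkflags of 0.
 */
#if 0
static struct vnode *
example_getnewvnode(int setup_error)
{
	struct vnode *vp;

	vp = allocvnode(0, 0);		/* returned VX locked and referenced */
	if (setup_error) {
		/* v_type is still VNON, so vx_put() flags VREF_FINALIZE */
		vx_put(vp);
		return (NULL);
	}
	return (vp);			/* caller completes initialization */
}
#endif
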
/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedvnodes(0) + inactivevnodes >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}