/* sys/kern/vfs_lock.c */

/*
 * Copyright (c) 2004,2013-2022 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *      INACTIVE -> CACHED|DYING        vx_lock(excl) + vi->spin
 *      DYING    -> CACHED              vx_lock(excl)
 *      ACTIVE   -> INACTIVE            (none)       + v_spin + vi->spin
 *      INACTIVE -> ACTIVE              vn_lock(any) + v_spin + vi->spin
 *      CACHED   -> ACTIVE              vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *       Switching into ACTIVE also requires a vref and vnode lock, however
 *       the vnode lock is allowed to be SHARED.
 *
 *       Switching into a CACHED or DYING state requires an exclusive vnode
 *       lock or vx_lock (which is almost the same thing but not quite).
 */
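
/*
 * Illustrative sketch (not part of the original source): per the transition
 * table above, reactivating a CACHED or INACTIVE vnode only needs
 * vn_lock(any), so a shared lock through vget() is sufficient.  The caller
 * is assumed to already hold something (e.g. a namecache reference) that
 * keeps the vnode from being destroyed.  The function name is hypothetical.
 */
#if 0
static int
example_reactivate(struct vnode *vp)
{
        int error;

        /* vget() refs the vnode and reactivates it if needed */
        error = vget(vp, LK_SHARED);
        if (error)
                return(error);          /* e.g. ENOENT if being reclaimed */

        /* ... use the now-ACTIVE, locked vnode ... */

        vput(vp);                       /* unlock + vrele */
        return(0);
}
#endif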

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#define VACT_MAX        10
#define VACT_INC        2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE_OBJ(M_VNODE, sizeof(struct vnode), "vnodes", "vnodes");
static MALLOC_DEFINE(M_VNODE_HASH, "vnodelsthash", "vnode list hash");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2    123462047LU
#define VLIST_XOR       (uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)  (((uintptr_t)vp ^ VLIST_XOR) % \
                         VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
        &activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
        &cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
        &inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
        &batchfreevnodes, 0, "Number of vnodes to free at once");

static long auxrecovervnodes1;
SYSCTL_INT(_debug, OID_AUTO, auxrecovervnodes1, CTLFLAG_RW,
        &auxrecovervnodes1, 0, "vnlru auxiliary vnodes recovered");
static long auxrecovervnodes2;
SYSCTL_INT(_debug, OID_AUTO, auxrecovervnodes2, CTLFLAG_RW,
        &auxrecovervnodes2, 0, "vnlru auxiliary vnodes recovered");

#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
        &trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
        int i;

        kmalloc_obj_raise_limit(M_VNODE, 0);    /* unlimited */
        vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
                                  M_VNODE_HASH, M_ZERO | M_WAITOK);
        for (i = 0; i < ncpus; ++i) {
                struct vnode_index *vi = &vnode_list_hash[i];

                TAILQ_INIT(&vi->inactive_list);
                TAILQ_INIT(&vi->active_list);
                TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
                spin_init(&vi->spin, "vfslock");
        }
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
        atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
        atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
        _vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
        _vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
        struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
        if ((u_long)vp == trackvnode)
                kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
        spin_lock(&vi->spin);

        switch(vp->v_state) {
        case VS_ACTIVE:
                spin_unlock(&vi->spin);
                panic("_vactivate: already active");
                /* NOT REACHED */
                return;
        case VS_INACTIVE:
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                break;
        case VS_CACHED:
        case VS_DYING:
                break;
        }
        TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
        vp->v_state = VS_ACTIVE;
        spin_unlock(&vi->spin);
        atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
        struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
        if ((u_long)vp == trackvnode) {
                kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
                print_backtrace(-1);
        }
#endif
        spin_lock(&vi->spin);

        /*
         * Remove from active list if it is sitting on it
         */
        switch(vp->v_state) {
        case VS_ACTIVE:
                TAILQ_REMOVE(&vi->active_list, vp, v_list);
                atomic_add_int(&mycpu->gd_activevnodes, -1);
                break;
        case VS_INACTIVE:
                spin_unlock(&vi->spin);
                panic("_vinactive: already inactive");
                /* NOT REACHED */
                return;
        case VS_CACHED:
        case VS_DYING:
                break;
        }

        /*
         * Distinguish between basically dead vnodes, vnodes with cached
         * data, and vnodes without cached data.  A rover will shift the
         * vnodes around as their cache status is lost.
         */
        if (vp->v_flag & VRECLAIMED) {
                TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
        } else {
                TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
        }
        vp->v_state = VS_INACTIVE;
        spin_unlock(&vi->spin);
        atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
        KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
                ("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
        atomic_add_int(&vp->v_refcnt, 1);
}

void
vref_special(struct vnode *vp)
{
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
}

void
synchronizevnodecount(void)
{
        int nca = 0;
        int act = 0;
        int ina = 0;
        int i;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                nca += gd->gd_cachedvnodes;
                act += gd->gd_activevnodes;
                ina += gd->gd_inactivevnodes;
        }
        cachedvnodes = nca;
        activevnodes = act;
        inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
        int i;
        int n = 0;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                n += gd->gd_cachedvnodes;
        }
        return n;
}

int
countcachedandinactivevnodes(void)
{
        int i;
        int n = 0;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
        }
        return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
        int count;

#if 1
        count = vp->v_refcnt;
        cpu_ccfence();

        for (;;) {
                KKASSERT((count & VREF_MASK) > 0);
                KKASSERT(vp->v_state == VS_ACTIVE ||
                         vp->v_state == VS_INACTIVE);

                /*
                 * 2+ case
                 */
                if ((count & VREF_MASK) > 1) {
                        if (atomic_fcmpset_int(&vp->v_refcnt,
                                               &count, count - 1)) {
                                break;
                        }
                        continue;
                }

                /*
                 * 1->0 transition case must handle possible finalization.
                 * When finalizing we transition 1->0x40000000.  Note that
                 * cachedvnodes is only adjusted on transitions to ->0.
                 *
                 * WARNING! VREF_TERMINATE can be cleared at any point
                 *          when the refcnt is non-zero (by vget()) and
                 *          the vnode has not been reclaimed.  Thus
                 *          transitions out of VREF_TERMINATE do not have
                 *          to mess with cachedvnodes.
                 */
                if (count & VREF_FINALIZE) {
                        vx_lock(vp);
                        if (atomic_fcmpset_int(&vp->v_refcnt,
                                               &count, VREF_TERMINATE)) {
                                vnode_terminate(vp);
                                break;
                        }
                        vx_unlock(vp);
                } else {
                        if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
                                atomic_add_int(&mycpu->gd_cachedvnodes, 1);
                                break;
                        }
                }
                cpu_pause();
                /* retry */
        }
#else
        /*
         * XXX NOT YET WORKING!  Multiple threads can reference the vnode
         * after dropping their count, racing destruction, because this
         * code is not directly transitioning from 1->VREF_FINALIZE.
         */

        /*
         * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
         * and attempt to acquire VREF_TERMINATE if set.  It is possible for
         * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
         * only one will be able to transition the vnode into the
         * VREF_TERMINATE state.
         *
         * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
         *       this state once.
         */
        count = atomic_fetchadd_int(&vp->v_refcnt, -1);
        if ((count & VREF_MASK) == 1) {
                atomic_add_int(&mycpu->gd_cachedvnodes, 1);
                --count;
                while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
                        vx_lock(vp);
                        if (atomic_fcmpset_int(&vp->v_refcnt,
                                               &count, VREF_TERMINATE)) {
                                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
                                vnode_terminate(vp);
                                break;
                        }
                        vx_unlock(vp);
                }
        }
#endif
}
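
/*
 * Illustrative sketch (not part of the original source): a vref()/vrele()
 * pair bracketing work on a vnode that is already known to be referenced
 * and active, for example a vnode held open by a file descriptor.  The
 * function name is hypothetical.
 */
#if 0
static void
example_ref_and_release(struct vnode *vp)
{
        vref(vp);               /* vp already active: cheap atomic add */

        /* ... hand vp to code that expects to own a reference ... */

        vrele(vp);              /* deactivates on 1->0 only if VREF_FINALIZE */
}
#endif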

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *           already be held by the caller.  vdrop() will clean up the
 *           free list state.
 */
void
vhold(struct vnode *vp)
{
        atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
        atomic_add_int(&vp->v_auxrefs, -1);
}
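
/*
 * Illustrative sketch (not part of the original source): an auxiliary
 * structure pinning the vnode's memory with vhold()/vdrop().  The vnode can
 * still be deactivated or reclaimed while held; the hold only prevents the
 * final kfree().  The structure and function names are hypothetical.
 */
#if 0
struct example_track {
        struct vnode *t_vp;
};

static void
example_track_attach(struct example_track *t, struct vnode *vp)
{
        vhold(vp);              /* pin memory, does not block reclaim */
        t->t_vp = vp;
}

static void
example_track_detach(struct example_track *t)
{
        vdrop(t->t_vp);
        t->t_vp = NULL;
}
#endif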

/*
 * Set VREF_FINALIZE to request that the vnode be inactivated
 * as soon as possible (on the 1->0 transition of its refs).
 *
 * Caller must have a ref on the vnode.
 *
 * This function has no effect if the vnode is already in termination
 * processing.
 */
void
vfinalize(struct vnode *vp)
{
        if ((vp->v_refcnt & VREF_MASK) > 0)
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
}

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *       or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *       previously be active).  We lose control of the vnode the instant
 *       it is placed on the free list.
 *
 *       The VX lock is required when transitioning to VS_CACHED but is
 *       not sufficient for the vshouldfree() interlocked test or when
 *       transitioning away from VS_CACHED.  v_spin is also required for
 *       those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
        KKASSERT(vp->v_state == VS_ACTIVE);

        if ((vp->v_flag & VINACTIVE) == 0) {
                _vsetflags(vp, VINACTIVE);
                if (vp->v_mount)
                        VOP_INACTIVE(vp);
        }
        spin_lock(&vp->v_spin);
        _vinactive(vp);
        spin_unlock(&vp->v_spin);

        vx_unlock(vp);
}

/****************************************************************
 *                      VX LOCKING FUNCTIONS                    *
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
        lockmgr(&vp->v_lock, LK_EXCLUSIVE);
        spin_lock_update_only(&vp->v_spin);
}

void
vx_unlock(struct vnode *vp)
{
        spin_unlock_update_only(&vp->v_spin);
        lockmgr(&vp->v_lock, LK_RELEASE);
}

/*
 * Downgrades a VX lock to a normal VN lock.  The lock remains EXCLUSIVE.
 *
 * Generally required after calling getnewvnode() if the intention is
 * to return a normal locked vnode to the caller.
 */
void
vx_downgrade(struct vnode *vp)
{
        spin_unlock_update_only(&vp->v_spin);
}

/****************************************************************
 *                 VNODE ACQUISITION FUNCTIONS                  *
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
        int error;

        /*
         * A lock type must be passed
         */
        if ((flags & LK_TYPE_MASK) == 0) {
                panic("vget() called with no lock specified!");
                /* NOT REACHED */
        }

        /*
         * Reference the structure and then acquire the lock.
         *
         * NOTE: The requested lock might be a shared lock and does
         *       not protect our access to the refcnt or other fields.
         */
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);

        if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
                /*
                 * The lock failed, undo and return an error.  This will not
                 * normally trigger a termination.
                 */
                vrele(vp);
        } else if (vp->v_flag & VRECLAIMED) {
                /*
                 * The node is being reclaimed and cannot be reactivated
                 * any more, undo and return ENOENT.
                 */
                vn_unlock(vp);
                vrele(vp);
                error = ENOENT;
        } else if (vp->v_state == VS_ACTIVE) {
                /*
                 * A VS_ACTIVE vnode coupled with the fact that we have
                 * a vnode lock (even if shared) prevents v_state from
                 * changing.  Since the vnode is not in a VRECLAIMED state,
                 * we can safely clear VINACTIVE.
                 *
                 * It is possible for a shared lock to cause a race with
                 * another thread that is also in the process of clearing
                 * VREF_TERMINATE, meaning that we might return with it still
                 * set and then assert in a later vref().  The solution is to
                 * unconditionally clear VREF_TERMINATE here as well.
                 *
                 * NOTE! Multiple threads may clear VINACTIVE if this is a
                 *       shared lock.  This race is allowed.
                 */
                if (vp->v_flag & VINACTIVE)
                        _vclrflags(vp, VINACTIVE);      /* SMP race ok */
                if (vp->v_act < VACT_MAX) {
                        vp->v_act += VACT_INC;
                        if (vp->v_act > VACT_MAX)       /* SMP race ok */
                                vp->v_act = VACT_MAX;
                }
                error = 0;
                if (vp->v_refcnt & VREF_TERMINATE)      /* SMP race ok */
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
        } else {
                /*
                 * If the vnode is not VS_ACTIVE it must be reactivated
                 * in addition to clearing VINACTIVE.  An exclusive spin_lock
                 * is needed to manipulate the vnode's list.
                 *
                 * Because the lockmgr lock might be shared, we might race
                 * another reactivation, which we handle.  In this situation,
                 * however, the refcnt prevents other v_state races.
                 *
                 * As with above, clearing VINACTIVE is allowed to race other
                 * clearings of VINACTIVE.
                 *
                 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
                 * the refcnt is non-zero and the vnode has not been
                 * reclaimed.  This also means that the transitions do
                 * not affect cachedvnodes.
                 *
                 * It is possible for a shared lock to cause a race with
                 * another thread that is also in the process of clearing
                 * VREF_TERMINATE, meaning that we might return with it still
                 * set and then assert in a later vref().  The solution is to
                 * unconditionally clear VREF_TERMINATE here as well.
                 */
                _vclrflags(vp, VINACTIVE);
                vp->v_act += VACT_INC;
                if (vp->v_act > VACT_MAX)       /* SMP race ok */
                        vp->v_act = VACT_MAX;
                spin_lock(&vp->v_spin);

                switch(vp->v_state) {
                case VS_INACTIVE:
                        _vactivate(vp);
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
                                                        VREF_FINALIZE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_CACHED:
                        _vactivate(vp);
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
                                                        VREF_FINALIZE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_ACTIVE:
                        atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
                                                        VREF_TERMINATE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_DYING:
                        spin_unlock(&vp->v_spin);
                        panic("Impossible VS_DYING state");
                        break;
                }
                error = 0;
        }
        return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
        kprintf("vput(%p) %s:%d\n", vp, filename, line);
        vn_unlock(vp);
        vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
        vn_unlock(vp);
        vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
        lockmgr(&vp->v_lock, LK_EXCLUSIVE);
        spin_lock_update_only(&vp->v_spin);
}

int
vx_get_nonblock(struct vnode *vp)
{
        int error;

        if (lockinuse(&vp->v_lock))
                return(EBUSY);
        error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
        if (error == 0) {
                spin_lock_update_only(&vp->v_spin);
                if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                        atomic_add_int(&mycpu->gd_cachedvnodes, -1);
        }
        return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
        if (vp->v_type == VNON || vp->v_type == VBAD)
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
        spin_unlock_update_only(&vp->v_spin);
        lockmgr(&vp->v_lock, LK_RELEASE);
        vrele(vp);
}
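
/*
 * Illustrative sketch (not part of the original source): vx_get()/vx_put()
 * used by deactivation/reclamation-related code that must lock the vnode
 * without reactivating it, in contrast to vget()/vput().  The caller is
 * assumed to already hold something that keeps vp from being destroyed.
 * The function name is hypothetical.
 */
#if 0
static void
example_scan_vnode(struct vnode *vp)
{
        vx_get(vp);             /* exclusive lock + ref, no reactivation */

        /* ... inspect or tear down state without changing v_state ... */

        vx_put(vp);             /* unlock; vrele() handles transitions */
}
#endif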

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 *       The returned vnode will be VX locked.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
        struct vnode_index *vi;
        struct vnode *vp;
        int count;
        int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
        int ri;
        int cpu_count;
        int cachedvnodes;

        /*
         * Try to deactivate some vnodes cached on the active list.  We
         * generally want a 50-50 balance active vs inactive.
         */
        cachedvnodes = countcachedvnodes();
        if (cachedvnodes < inactivevnodes)
                goto skip;

        ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

        for (count = 0; count < maxcount * 2; ++count, ++ri) {
                vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

                spin_lock(&vi->spin);

                vp = TAILQ_NEXT(&vi->active_rover, v_list);
                TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
                if (vp == NULL) {
                        TAILQ_INSERT_HEAD(&vi->active_list,
                                          &vi->active_rover, v_list);
                } else {
                        TAILQ_INSERT_AFTER(&vi->active_list, vp,
                                           &vi->active_rover, v_list);
                }
                if (vp == NULL) {
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * Don't try to deactivate if someone has the vp referenced.
                 */
                if ((vp->v_refcnt & VREF_MASK) != 0) {
                        spin_unlock(&vi->spin);
                        vp->v_act += VACT_INC;
                        if (vp->v_act > VACT_MAX)       /* SMP race ok */
                                vp->v_act = VACT_MAX;
                        continue;
                }

                /*
                 * Calculate the deactivation weight.  Reduce v_act less
                 * if the vnode's object has a lot of VM pages.
                 *
                 * XXX obj race
                 */
                if (vp->v_act > 0) {
                        vm_object_t obj;

                        if ((obj = vp->v_object) != NULL &&
                            obj->resident_page_count >= trigger)
                        {
                                vp->v_act -= 1;
                        } else {
                                vp->v_act -= VACT_INC;
                        }
                        if (vp->v_act < 0)
                                vp->v_act = 0;
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * If v_auxrefs is not the expected value the vnode might
                 * reside in the namecache topology on an internal node and
                 * not at a leaf.  v_auxrefs can be wrong for other reasons,
                 * but this is the most likely.
                 *
                 * Such vnodes will not be recycled by vnlru later on in
                 * its inactive scan, so try to make the vnode presentable
                 * and only move it to the inactive queue if we can.
                 *
                 * On success, the vnode is disconnected from the namecache
                 * topology entirely, making vnodes above it in the topology
                 * recyclable.  This will allow the active scan to continue
                 * to make progress in balancing the active and inactive
                 * lists.
                 */
                if (vp->v_auxrefs != vp->v_namecache_count) {
                        if (vx_get_nonblock(vp) == 0) {
                                spin_unlock(&vi->spin);
                                if ((vp->v_refcnt & VREF_MASK) == 1)
                                        cache_inval_vp_quick(vp);
                                if (vp->v_auxrefs == vp->v_namecache_count)
                                        ++auxrecovervnodes1;
                                vx_put(vp);
                        } else {
                                spin_unlock(&vi->spin);
                        }
                        continue;
                }

                /*
                 * Try to deactivate the vnode.  It is ok if v_auxrefs
                 * races every once in a while, we just don't want an
                 * excess of unreclaimable vnodes on the inactive list.
                 */
                if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                        atomic_add_int(&mycpu->gd_cachedvnodes, -1);
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

                spin_unlock(&vi->spin);
                vrele(vp);
        }

        vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
        /*
         * Loop trying to lock the first vnode on the free list.
         * Cycle if we can't.
         */
        cpu_count = ncpus;
        ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

        for (count = 0; count < maxcount; ++count, ++ri) {
                vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

                spin_lock(&vi->spin);

                vp = TAILQ_FIRST(&vi->inactive_list);
                if (vp == NULL) {
                        spin_unlock(&vi->spin);
                        if (--cpu_count == 0)
                                break;
                        ri = (ri + 16) & ~15;
                        --ri;
                        continue;
                }

                /*
                 * non-blocking vx_get will also ref the vnode on success.
                 */
                if (vx_get_nonblock(vp)) {
                        KKASSERT(vp->v_state == VS_INACTIVE);
                        TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                        TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * Because we are holding vfs_spin the vnode should currently
                 * be inactive and VREF_TERMINATE should still be set.
                 *
                 * Once vfs_spin is released the vnode's state should remain
                 * unmodified due to both the lock and ref on it.
                 */
                KKASSERT(vp->v_state == VS_INACTIVE);
                spin_unlock(&vi->spin);
#ifdef TRACKVNODE
                if ((u_long)vp == trackvnode)
                        kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

                /*
                 * The active scan already did this, but some leakage can
                 * happen.  Don't let an easily recyclable vnode go to
                 * waste!
                 */
                if (vp->v_auxrefs != vp->v_namecache_count &&
                    (vp->v_refcnt & ~VREF_FINALIZE) == VREF_TERMINATE + 1)
                {
                        cache_inval_vp_quick(vp);
                        if (vp->v_auxrefs == vp->v_namecache_count)
                                ++auxrecovervnodes2;
                }

                /*
                 * Do not reclaim/reuse a vnode while auxiliary refs exist.
                 * This includes namecache refs due to a related ncp being
                 * locked or having children, a VM object association, or
                 * other hold users.
                 *
                 * Do not reclaim/reuse a vnode if someone else has a real
                 * ref on it.  This can occur if a filesystem temporarily
                 * releases the vnode lock during VOP_RECLAIM.
                 */
                if (vp->v_auxrefs != vp->v_namecache_count ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
                        if (vp->v_state == VS_INACTIVE) {
                                spin_lock(&vi->spin);
                                if (vp->v_state == VS_INACTIVE) {
                                        TAILQ_REMOVE(&vi->inactive_list,
                                                     vp, v_list);
                                        TAILQ_INSERT_TAIL(&vi->inactive_list,
                                                          vp, v_list);
                                }
                                spin_unlock(&vi->spin);
                        }
                        vx_put(vp);
                        continue;
                }

                /*
                 * VINACTIVE and VREF_TERMINATE are expected to both be set
                 * for vnodes pulled from the inactive list, and cannot be
                 * changed while we hold the vx lock.
                 *
                 * Try to reclaim the vnode.
                 *
                 * The cache_inval_vp() can fail if any of the namecache
                 * elements are actively locked, preventing the vnode from
                 * being reclaimed.  This is the desired operation as it
                 * gives the namecache code certain guarantees just by
                 * holding a ncp.
                 */
                KKASSERT(vp->v_flag & VINACTIVE);
                KKASSERT(vp->v_refcnt & VREF_TERMINATE);

                if ((vp->v_flag & VRECLAIMED) == 0) {
                        if (cache_inval_vp_nonblock(vp))
                                goto failed;
                        vgone_vxlocked(vp);
                        /* vnode is still VX locked */
                }

                /*
                 * At this point if there are no other refs or auxrefs on
                 * the vnode with the inactive list locked, and we remove
                 * the vnode from the inactive list, it should not be
                 * possible for anyone else to access the vnode any more.
                 *
                 * Since the vnode is in a VRECLAIMED state, no new
                 * namecache associations could have been made and the
                 * vnode should have already been removed from its mountlist.
                 *
                 * Since we hold a VX lock on the vnode it cannot have been
                 * reactivated (moved out of the inactive list).
                 */
                KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
                spin_lock(&vi->spin);
                if (vp->v_auxrefs ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
                        spin_unlock(&vi->spin);
                        goto failed;
                }
                KKASSERT(vp->v_state == VS_INACTIVE);
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                vp->v_state = VS_DYING;
                spin_unlock(&vi->spin);

                /*
                 * Nothing should have been able to access this vp.  Only
                 * our ref should remain now.
                 */
                atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
                KASSERT(vp->v_refcnt == 1,
                        ("vp %p badrefs %08x", vp, vp->v_refcnt));

                /*
                 * Return a VX locked vnode suitable for reuse.
                 */
                vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
                return(vp);
        }
        vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
        return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
        struct vnode *vp;
        struct vnode_index *vi;

        /*
         * lktimeout only applies when LK_TIMELOCK is used, and only
         * the pageout daemon uses it.  The timeout may not be zero
         * or the pageout daemon can deadlock in low-VM situations.
         */
        if (lktimeout == 0)
                lktimeout = hz / 10;

        /*
         * Do not flag for synchronous recyclement unless there are enough
         * freeable vnodes to recycle and the number of vnodes has
         * significantly exceeded our target.  We want the normal vnlru
         * process to handle the cleaning (at 9/10's) before we are forced
         * to flag it here at 11/10's for userexit path processing.
         */
        if (numvnodes >= maxvnodes * 11 / 10 &&
            cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
                struct thread *td = curthread;
                if (td->td_lwp)
                        atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
        }

        /*
         * Try to trivially reuse a reclaimed vnode from the head of the
         * inactive list for this cpu.  Any vnode cycling which occurs
         * which terminates the vnode will cause it to be returned to the
         * same pcpu structure (e.g. unlink calls).
         */
        vi = &vnode_list_hash[mycpuid];
        spin_lock(&vi->spin);

        vp = TAILQ_FIRST(&vi->inactive_list);
        if (vp && (vp->v_flag & VRECLAIMED)) {
                /*
                 * non-blocking vx_get will also ref the vnode on success.
                 */
                if (vx_get_nonblock(vp)) {
                        KKASSERT(vp->v_state == VS_INACTIVE);
                        TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                        TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
                        spin_unlock(&vi->spin);
                        goto slower;
                }

                /*
                 * Because we are holding vfs_spin the vnode should currently
                 * be inactive and VREF_TERMINATE should still be set.
                 *
                 * Once vfs_spin is released the vnode's state should remain
                 * unmodified due to both the lock and ref on it.
                 */
                KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
                if ((u_long)vp == trackvnode)
                        kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

                /*
                 * Do not reclaim/reuse a vnode while auxiliary refs exist.
                 * This includes namecache refs due to a related ncp being
                 * locked or having children, a VM object association, or
                 * other hold users.
                 *
                 * Do not reclaim/reuse a vnode if someone else has a real
                 * ref on it.  This can occur if a filesystem temporarily
                 * releases the vnode lock during VOP_RECLAIM.
                 */
                if (vp->v_auxrefs ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
                        if (vp->v_state == VS_INACTIVE) {
                                TAILQ_REMOVE(&vi->inactive_list,
                                             vp, v_list);
                                TAILQ_INSERT_TAIL(&vi->inactive_list,
                                                  vp, v_list);
                        }
                        spin_unlock(&vi->spin);
                        vx_put(vp);
                        goto slower;
                }

                /*
                 * VINACTIVE and VREF_TERMINATE are expected to both be set
                 * for vnodes pulled from the inactive list, and cannot be
                 * changed while we hold the vx lock.
                 *
                 * Try to reclaim the vnode.
                 */
                KKASSERT(vp->v_flag & VINACTIVE);
                KKASSERT(vp->v_refcnt & VREF_TERMINATE);

                if ((vp->v_flag & VRECLAIMED) == 0) {
                        spin_unlock(&vi->spin);
                        vx_put(vp);
                        goto slower;
                }

                /*
                 * At this point if there are no other refs or auxrefs on
                 * the vnode with the inactive list locked, and we remove
                 * the vnode from the inactive list, it should not be
                 * possible for anyone else to access the vnode any more.
                 *
                 * Since the vnode is in a VRECLAIMED state, no new
                 * namecache associations could have been made and the
                 * vnode should have already been removed from its mountlist.
                 *
                 * Since we hold a VX lock on the vnode it cannot have been
                 * reactivated (moved out of the inactive list).
                 */
                KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
                KKASSERT(vp->v_state == VS_INACTIVE);
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                vp->v_state = VS_DYING;
                spin_unlock(&vi->spin);

                /*
                 * Nothing should have been able to access this vp.  Only
                 * our ref should remain now.
                 *
                 * At this point we can kfree() the vnode if we want to.
                 * Instead, we reuse it for the allocation.
                 */
                atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
                KASSERT(vp->v_refcnt == 1,
                        ("vp %p badrefs %08x", vp, vp->v_refcnt));
                vx_unlock(vp);          /* safety: keep the API clean */
                bzero(vp, sizeof(*vp));
        } else {
                spin_unlock(&vi->spin);
slower:
                vp = kmalloc_obj(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
                atomic_add_int(&numvnodes, 1);
        }

        lwkt_token_init(&vp->v_token, "vnode");
        lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
        TAILQ_INIT(&vp->v_namecache);
        RB_INIT(&vp->v_rbclean_tree);
        RB_INIT(&vp->v_rbdirty_tree);
        RB_INIT(&vp->v_rbhash_tree);
        spin_init(&vp->v_spin, "allocvnode");

        vx_lock(vp);
        vp->v_refcnt = 1;
        vp->v_flag = VAGE0 | VAGE1;
        vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

        KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
        /* exclusive lock still held */

        vp->v_filesize = NOOFFSET;
        vp->v_type = VNON;
        vp->v_tag = 0;
        vp->v_state = VS_CACHED;
        _vactivate(vp);

        return (vp);
}
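
/*
 * Illustrative sketch (not part of the original source): what a consumer of
 * allocvnode() might do with the VX locked, vref'd vnode it receives, based
 * on the comments above and on vx_downgrade().  Real filesystems normally go
 * through higher level helpers; the inode type, field choices, and function
 * name here are hypothetical.
 */
#if 0
struct example_inode;                   /* hypothetical fs-private inode */

static struct vnode *
example_new_vnode(struct example_inode *ip)
{
        struct vnode *vp;

        vp = allocvnode(0, 0);          /* returned VX locked + vref'd */
        vp->v_type = VREG;              /* hypothetical: regular file */
        vp->v_data = ip;                /* attach fs-private data */
        vx_downgrade(vp);               /* hand back a normally locked vnode */
        return(vp);
}
#endif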

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *          present under directory vnodes in the namecache.  For the
 *          moment use an if() instead of a while() and note that if
 *          we were to use a while() we would still have to break out
 *          if freesomevnodes() returned 0.  vnlru will also be trying
 *          hard to free vnodes at the same time (with a lower trigger
 *          pointer).
 */
void
allocvnode_gc(void)
{
        if (numvnodes >= maxvnodes &&
            countcachedandinactivevnodes() >= maxvnodes * 5 / 10)
        {
                freesomevnodes(batchfreevnodes);
        }
}

int
freesomevnodes(int n)
{
        struct vnode *vp;
        int count = 0;

        while (n) {
                if ((vp = cleanfreevnode(n)) == NULL)
                        break;
                vx_unlock(vp);
                --n;
                ++count;
                kfree_obj(vp, M_VNODE);
                atomic_add_int(&numvnodes, -1);
        }
        return(count);
}