/*
 * Copyright (c) 2004,2013-2022 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing but not quite).
 */
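
/*
 * Illustrative sketch only (not part of this file's logic): the
 * CACHED -> ACTIVE rule in the table above corresponds roughly to the
 * sequence below, assuming the caller already holds a ref on the vnode.
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);	// any vnode lock suffices
 *	spin_lock(&vp->v_spin);
 *	spin_lock(&vi->spin);			// vi = hash bucket for vp
 *	... move vp to the active list, set VS_ACTIVE ...
 *	spin_unlock(&vi->spin);
 *	spin_unlock(&vp->v_spin);
 */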
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm_object.h>
static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE_OBJ(M_VNODE, sizeof(struct vnode), "vnodes", "vnodes");
static MALLOC_DEFINE(M_VNODE_HASH, "vnodelsthash", "vnode list hash");
/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted at
 * the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)
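
/*
 * Illustrative sketch only: every list operation below resolves a vnode
 * to its per-cpu bucket with the pattern shown here.  The XOR and the
 * large prime scramble the pointer bits before the final modulo by ncpus.
 *
 *	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
 *
 *	spin_lock(&vi->spin);
 *	... manipulate vi->active_list / vi->inactive_list ...
 *	spin_unlock(&vi->spin);
 */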
static struct vnode_index *vnode_list_hash;
int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	   &activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	   &cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	   &inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	   &batchfreevnodes, 0, "Number of vnodes to free at once");

static long auxrecovervnodes1;
SYSCTL_INT(_debug, OID_AUTO, auxrecovervnodes1, CTLFLAG_RW,
	   &auxrecovervnodes1, 0, "vnlru auxiliary vnodes recovered");
static long auxrecovervnodes2;
SYSCTL_INT(_debug, OID_AUTO, auxrecovervnodes2, CTLFLAG_RW,
	   &auxrecovervnodes2, 0, "vnlru auxiliary vnodes recovered");

static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	     &trackvnode, 0, "");
/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_obj_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE_HASH, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}
static __inline void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}
/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);

	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}
/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
	}

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}
/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

void
vref_special(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
}
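
/*
 * Illustrative sketch only: the "cached" accounting hinges on 0->1 and
 * 1->0 transitions of the VREF_MASK portion of v_refcnt.  A vnode with
 * zero real refs counts toward gd_cachedvnodes, so the first ref must
 * remove it from that count (as vref_special() does above) and the last
 * vrele() must add it back:
 *
 *	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
 *		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
 *
 * Plain vref() skips this because it is only legal on vnodes already
 * known to have at least one ref.
 */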
/*
 * Synchronize the global counters with the per-cpu accumulators.
 */
void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);

		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}
/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);

		n += gd->gd_cachedvnodes;
	}
	return n;
}
int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);

		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}
/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	int count;

#if 1
	count = vp->v_refcnt;
	cpu_ccfence();

	for (;;) {
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case, drop one ref lock-free.
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, count - 1)) {
				break;
			}
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
#else
	/*
	 * XXX NOT YET WORKING!  Multiple threads can reference the vnode
	 * after dropping their count, racing destruction, because this
	 * code is not directly transitioning from 1->VREF_FINALIZE.
	 */

	/*
	 * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
	 * and attempt to acquire VREF_TERMINATE if set.  It is possible for
	 * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
	 * only one will be able to transition the vnode into the
	 * VREF_TERMINATE state.
	 *
	 * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
	 *	 this state once.
	 */
	count = atomic_fetchadd_int(&vp->v_refcnt, -1);
	if ((count & VREF_MASK) == 1) {
		atomic_add_int(&mycpu->gd_cachedvnodes, 1);
		--count;
		while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, -1);
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		}
	}
#endif
}
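
/*
 * Illustrative sketch only (assumed caller, not part of this file):
 * a caller that wants the vnode deactivated as soon as its last real
 * ref goes away marks it before the final release.
 *
 *	vfinalize(vp);		// set VREF_FINALIZE, vp remains usable
 *	vrele(vp);		// on the 1->0 transition the vnode is
 *				// vx_lock()ed and vnode_terminate()d
 *
 * Without VREF_FINALIZE the final vrele() is a lockless 1->0 transition
 * and the vnode simply stays on the active list as a cached vnode.
 */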
/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING! vhold() must not acquire v_spin.  The spinlock may or may not
 *	    already be held by the caller.  vdrop() will clean up the
 *	    free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
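
/*
 * Illustrative sketch only: auxiliary refs are what e.g. the namecache
 * uses to pin the vnode structure itself without keeping it active.
 *
 *	vhold(vp);		// vp cannot be kfree()'d ...
 *	...			// ... but it may still be deactivated
 *	...			// or reclaimed while we hold it
 *	vdrop(vp);
 *
 * A real ref (vref()/vget()) is still required before actually using
 * the vnode.
 */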
/*
 * Set VREF_FINALIZE to request that the vnode be inactivated
 * as soon as possible (on the 1->0 transition of its refs).
 *
 * Caller must have a ref on the vnode.
 *
 * This function has no effect if the vnode is already in termination
 * processing.
 */
void
vfinalize(struct vnode *vp)
{
	if ((vp->v_refcnt & VREF_MASK) > 0)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
}
/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}
/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	spin_lock_update_only(&vp->v_spin);
}

void
vx_unlock(struct vnode *vp)
{
	spin_unlock_update_only(&vp->v_spin);
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/*
 * Downgrades a VX lock to a normal VN lock.  The lock remains EXCLUSIVE.
 *
 * Generally required after calling getnewvnode() if the intention is
 * to return a normal locked vnode to the caller.
 */
void
vx_downgrade(struct vnode *vp)
{
	spin_unlock_update_only(&vp->v_spin);
}
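
/*
 * Illustrative sketch only (assumed caller): a filesystem creating a new
 * vnode typically holds the VX lock through setup and then downgrades to
 * an ordinary exclusive vnode lock for its caller.
 *
 *	vp = allocvnode(lktimeout, lkflags);	// returned VX locked + ref'd
 *	... initialize v_type, v_data, attach to the mount ...
 *	vx_downgrade(vp);			// now a normal LK_EXCLUSIVE lock
 *	return vp;				// caller vput()s it when done
 */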
/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
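
/*
 * Illustrative sketch only (assumed caller): a typical lookup path holds
 * a namecache (auxiliary) reference, so the vnode cannot be destroyed,
 * but its activation state is unknown and vget() must be used.
 *
 *	error = vget(vp, LK_SHARED);
 *	if (error == 0) {
 *		... vnode is referenced, locked, and VS_ACTIVE ...
 *		vput(vp);	// drops the lock and the ref together
 *	}
 */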
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is a
		 *	 shared lock.  This race is allowed.
		 */
		if (vp->v_flag & VINACTIVE)
			_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		if (vp->v_act < VACT_MAX) {
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
		}
		if (vp->v_refcnt & VREF_TERMINATE)	/* SMP race ok */
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
	}
	return(error);
}
void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

/*
 * Unlock and release a vnode acquired with vget().
 */
void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}
/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	spin_lock_update_only(&vp->v_spin);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		spin_lock_update_only(&vp->v_spin);
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}
/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	spin_unlock_update_only(&vp->v_spin);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 *	 The returned vnode will be VX locked.
 */
static struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;
	/*
	 * Try to deactivate some vnodes cached on the active list.  We
	 * generally want a 50-50 balance active vs inactive.
	 */
	cachedvnodes = countcachedvnodes();
	if (cachedvnodes < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Don't try to deactivate if someone has the vp referenced.
		 */
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * Calculate the deactivation weight.  Reduce v_act less
		 * if the vnode's object has a lot of VM pages.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;

			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * If v_auxrefs is not the expected value the vnode might
		 * reside in the namecache topology on an internal node and
		 * not at a leaf.  v_auxrefs can be wrong for other reasons,
		 * but this is the most likely.
		 *
		 * Such vnodes will not be recycled by vnlru later on in
		 * its inactive scan, so try to make the vnode presentable
		 * and only move it to the inactive queue if we can.
		 *
		 * On success, the vnode is disconnected from the namecache
		 * topology entirely, making vnodes above it in the topology
		 * recyclable.  This will allow the active scan to continue
		 * to make progress in balancing the active and inactive
		 * lists.
		 */
		if (vp->v_auxrefs != vp->v_namecache_count) {
			if (vx_get_nonblock(vp) == 0) {
				spin_unlock(&vi->spin);
				if ((vp->v_refcnt & VREF_MASK) == 1)
					cache_inval_vp_quick(vp);
				if (vp->v_auxrefs == vp->v_namecache_count)
					++auxrecovervnodes1;
				vx_put(vp);
			} else {
				spin_unlock(&vi->spin);
			}
			continue;
		}

		/*
		 * Try to deactivate the vnode.  It is ok if v_auxrefs
		 * races every once in a while, we just don't want an
		 * excess of unreclaimable vnodes on the inactive list.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;
	/*
	 * Loop trying to lock the first vnode on the free list.
	 */
skip:
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);

		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);

		/*
		 * The active scan already did this, but some leakage can
		 * happen.  Don't let an easily recyclable vnode go to
		 * waste.
		 */
		if (vp->v_auxrefs != vp->v_namecache_count &&
		    (vp->v_refcnt & ~VREF_FINALIZE) == VREF_TERMINATE + 1) {
			cache_inval_vp_quick(vp);
			if (vp->v_auxrefs == vp->v_namecache_count)
				++auxrecovervnodes2;
		}

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other holds.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs != vp->v_namecache_count ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 *
		 * The cache_inval_vp() can fail if any of the namecache
		 * elements are actively locked, preventing the vnode from
		 * being reclaimed.  This is desired operation as it gives
		 * the namecache code certain guarantees just by holding
		 * locks.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp)) {
				vx_put(vp);
				continue;
			}
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			continue;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}
/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;

		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);

		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other holds.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				TAILQ_REMOVE(&vi->inactive_list,
					     vp, v_list);
				TAILQ_INSERT_TAIL(&vi->inactive_list,
						  vp, v_list);
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		vx_unlock(vp);		/* safety: keep the API clean */
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc_obj(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	spin_lock_update_only(&vp->v_spin);
	atomic_add_int(&vp->v_refcnt, 1);
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return(vp);
}
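
/*
 * Illustrative sketch only (assumed filesystem caller): allocvnode()
 * hands back a VX locked, referenced, active vnode; the filesystem
 * finishes initialization and downgrades the lock for its caller.
 *
 *	vp = allocvnode(0, 0);
 *	vp->v_type = VREG;
 *	vp->v_data = my_inode;		// hypothetical fs-private data
 *	vx_downgrade(vp);		// convert the VX lock into a
 *					// normal exclusive vnode lock
 */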
/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}
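
/*
 * Illustrative arithmetic only (assumed numbers): with maxvnodes set to
 * 100000, vnlru begins cleaning at ~90000 (9/10), allocvnode() flags the
 * process for this path at >= 110000 (11/10), and the check above then
 * runs when numvnodes is still >= 100000 (10/10) and at least 50000
 * (5/10) vnodes are cached or inactive, freeing batchfreevnodes
 * (default 5) vnodes per pass.
 */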
int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree_obj(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}