kernel - Fix excessive call stack depth on stuck interrupt
[dragonfly.git] / sys / kern / vfs_lock.c
blob10df38069d6599486edefaeefa5949849c8cc115
1 /*
2 * Copyright (c) 2004,2013 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
36 * External lock/ref-related vnode functions
38 * vs_state transition locking requirements:
40 * INACTIVE -> CACHED|DYING vx_lock(excl) + vfs_spin
41 * DYING -> CACHED vx_lock(excl)
42 * ACTIVE -> INACTIVE (none) + v_spin + vfs_spin
43 * INACTIVE -> ACTIVE vn_lock(any) + v_spin + vfs_spin
44 * CACHED -> ACTIVE vn_lock(any) + v_spin + vfs_spin
46 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vfs_spin.
48 * Switching into ACTIVE also requires a vref and vnode lock, however
49 * the vnode lock is allowed to be SHARED.
51 * Switching into a CACHED or DYING state requires an exclusive vnode
52 * lock or vx_lock (which is almost the same thing).
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/kernel.h>
58 #include <sys/malloc.h>
59 #include <sys/mount.h>
60 #include <sys/proc.h>
61 #include <sys/vnode.h>
62 #include <sys/buf.h>
63 #include <sys/sysctl.h>
65 #include <machine/limits.h>
67 #include <vm/vm.h>
68 #include <vm/vm_object.h>
70 #include <sys/buf2.h>
71 #include <sys/thread2.h>
73 #define VACT_MAX 10
74 #define VACT_INC 2
76 static void vnode_terminate(struct vnode *vp);
78 static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
81 * The vnode free list hold inactive vnodes. Aged inactive vnodes
82 * are inserted prior to the mid point, and otherwise inserted
83 * at the tail.
85 TAILQ_HEAD(freelst, vnode);
86 static struct freelst vnode_active_list;
87 static struct freelst vnode_inactive_list;
88 static struct vnode vnode_active_rover;
89 static struct spinlock vfs_spin = SPINLOCK_INITIALIZER(vfs_spin, "vfs_spin");
91 int activevnodes = 0;
92 SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
93 &activevnodes, 0, "Number of active nodes");
94 int cachedvnodes = 0;
95 SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
96 &cachedvnodes, 0, "Number of total cached nodes");
97 int inactivevnodes = 0;
98 SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
99 &inactivevnodes, 0, "Number of inactive nodes");
100 static int batchfreevnodes = 5;
101 SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
102 &batchfreevnodes, 0, "Number of vnodes to free at once");
103 #ifdef TRACKVNODE
104 static u_long trackvnode;
105 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
106 &trackvnode, 0, "");
107 #endif
110 * Called from vfsinit()
112 void
113 vfs_lock_init(void)
115 TAILQ_INIT(&vnode_inactive_list);
116 TAILQ_INIT(&vnode_active_list);
117 TAILQ_INSERT_TAIL(&vnode_active_list, &vnode_active_rover, v_list);
118 spin_init(&vfs_spin, "vfslock");
119 kmalloc_raise_limit(M_VNODE, 0); /* unlimited */
123 * Misc functions
125 static __inline
126 void
127 _vsetflags(struct vnode *vp, int flags)
129 atomic_set_int(&vp->v_flag, flags);
132 static __inline
133 void
134 _vclrflags(struct vnode *vp, int flags)
136 atomic_clear_int(&vp->v_flag, flags);
/*
 * Exported wrapper around _vsetflags(): atomically sets (flags) in
 * vp->v_flag.
 */
void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}
/*
 * Exported wrapper around _vclrflags(): atomically clears (flags) in
 * vp->v_flag.
 */
void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}
152 * Place the vnode on the active list.
154 * Caller must hold vp->v_spin
156 static __inline
157 void
158 _vactivate(struct vnode *vp)
160 #ifdef TRACKVNODE
161 if ((u_long)vp == trackvnode)
162 kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
163 #endif
164 spin_lock(&vfs_spin);
166 switch(vp->v_state) {
167 case VS_ACTIVE:
168 panic("_vactivate: already active");
169 /* NOT REACHED */
170 spin_unlock(&vfs_spin);
171 return;
172 case VS_INACTIVE:
173 TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
174 --inactivevnodes;
175 break;
176 case VS_CACHED:
177 case VS_DYING:
178 break;
180 TAILQ_INSERT_TAIL(&vnode_active_list, vp, v_list);
181 vp->v_state = VS_ACTIVE;
182 ++activevnodes;
184 spin_unlock(&vfs_spin);
188 * Put a vnode on the inactive list.
190 * Caller must hold v_spin
192 static __inline
193 void
194 _vinactive(struct vnode *vp)
196 #ifdef TRACKVNODE
197 if ((u_long)vp == trackvnode) {
198 kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
199 print_backtrace(-1);
201 #endif
202 spin_lock(&vfs_spin);
205 * Remove from active list if it is sitting on it
207 switch(vp->v_state) {
208 case VS_ACTIVE:
209 TAILQ_REMOVE(&vnode_active_list, vp, v_list);
210 --activevnodes;
211 break;
212 case VS_INACTIVE:
213 panic("_vinactive: already inactive");
214 /* NOT REACHED */
215 spin_unlock(&vfs_spin);
216 return;
217 case VS_CACHED:
218 case VS_DYING:
219 break;
223 * Distinguish between basically dead vnodes, vnodes with cached
224 * data, and vnodes without cached data. A rover will shift the
225 * vnodes around as their cache status is lost.
227 if (vp->v_flag & VRECLAIMED) {
228 TAILQ_INSERT_HEAD(&vnode_inactive_list, vp, v_list);
229 } else {
230 TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
232 ++inactivevnodes;
233 vp->v_state = VS_INACTIVE;
235 spin_unlock(&vfs_spin);
238 static __inline
239 void
240 _vinactive_tail(struct vnode *vp)
242 spin_lock(&vfs_spin);
245 * Remove from active list if it is sitting on it
247 switch(vp->v_state) {
248 case VS_ACTIVE:
249 TAILQ_REMOVE(&vnode_active_list, vp, v_list);
250 --activevnodes;
251 break;
252 case VS_INACTIVE:
253 panic("_vinactive_tail: already inactive");
254 /* NOT REACHED */
255 spin_unlock(&vfs_spin);
256 return;
257 case VS_CACHED:
258 case VS_DYING:
259 break;
262 TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
263 ++inactivevnodes;
264 vp->v_state = VS_INACTIVE;
266 spin_unlock(&vfs_spin);
270 * Add a ref to an active vnode. This function should never be called
271 * with an inactive vnode (use vget() instead), but might be called
272 * with other states.
274 void
275 vref(struct vnode *vp)
277 KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
278 ("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
279 atomic_add_int(&vp->v_refcnt, 1);
283 * Count number of cached vnodes. This is middling expensive so be
284 * careful not to make this call in the critical path, particularly
285 * not updating the global. Each cpu tracks its own accumulator.
286 * The individual accumulators are not accurate and must be summed
287 * together.
290 countcachedvnodes(int gupdate)
292 int i;
293 int n = 0;
295 for (i = 0; i < ncpus; ++i) {
296 globaldata_t gd = globaldata_find(i);
297 n += gd->gd_cachedvnodes;
299 if (gupdate)
300 cachedvnodes = n;
301 return n;
305 * Release a ref on an active or inactive vnode.
307 * Caller has no other requirements.
309 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
310 * transition, otherwise we leave the vnode in the active list and
311 * do a lockless transition to 0, which is very important for the
312 * critical path.
314 * (vrele() is not called when a vnode is being destroyed w/kfree)
316 void
317 vrele(struct vnode *vp)
319 for (;;) {
320 int count = vp->v_refcnt;
321 cpu_ccfence();
322 KKASSERT((count & VREF_MASK) > 0);
323 KKASSERT(vp->v_state == VS_ACTIVE ||
324 vp->v_state == VS_INACTIVE);
327 * 2+ case
329 if ((count & VREF_MASK) > 1) {
330 if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
331 break;
332 continue;
336 * 1->0 transition case must handle possible finalization.
337 * When finalizing we transition 1->0x40000000. Note that
338 * cachedvnodes is only adjusted on transitions to ->0.
340 * WARNING! VREF_TERMINATE can be cleared at any point
341 * when the refcnt is non-zero (by vget()) and
342 * the vnode has not been reclaimed. Thus
343 * transitions out of VREF_TERMINATE do not have
344 * to mess with cachedvnodes.
346 if (count & VREF_FINALIZE) {
347 vx_lock(vp);
348 if (atomic_cmpset_int(&vp->v_refcnt,
349 count, VREF_TERMINATE)) {
350 vnode_terminate(vp);
351 break;
353 vx_unlock(vp);
354 } else {
355 if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
356 atomic_add_int(&mycpu->gd_cachedvnodes, 1);
357 break;
360 /* retry */
365 * Add an auxiliary data structure reference to the vnode. Auxiliary
366 * references do not change the state of the vnode or prevent deactivation
367 * or reclamation of the vnode, but will prevent the vnode from being
368 * destroyed (kfree()'d).
370 * WARNING! vhold() must not acquire v_spin. The spinlock may or may not
371 * already be held by the caller. vdrop() will clean up the
372 * free list state.
374 void
375 vhold(struct vnode *vp)
377 atomic_add_int(&vp->v_auxrefs, 1);
381 * Remove an auxiliary reference from the vnode.
383 void
384 vdrop(struct vnode *vp)
386 atomic_add_int(&vp->v_auxrefs, -1);
390 * This function is called on the 1->0 transition (which is actually
391 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
392 * of the vnode.
394 * Additional vrefs are allowed to race but will not result in a reentrant
395 * call to vnode_terminate() due to refcnt being VREF_TERMINATE. This
396 * prevents additional 1->0 transitions.
398 * ONLY A VGET() CAN REACTIVATE THE VNODE.
400 * Caller must hold the VX lock.
402 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
404 * NOTE: The vnode may be marked inactive with dirty buffers
405 * or dirty pages in its cached VM object still present.
407 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
408 * previously be active). We lose control of the vnode the instant
409 * it is placed on the free list.
411 * The VX lock is required when transitioning to VS_CACHED but is
412 * not sufficient for the vshouldfree() interlocked test or when
413 * transitioning away from VS_CACHED. v_spin is also required for
414 * those cases.
416 static
417 void
418 vnode_terminate(struct vnode *vp)
420 KKASSERT(vp->v_state == VS_ACTIVE);
422 if ((vp->v_flag & VINACTIVE) == 0) {
423 _vsetflags(vp, VINACTIVE);
424 if (vp->v_mount)
425 VOP_INACTIVE(vp);
426 /* might deactivate page */
428 spin_lock(&vp->v_spin);
429 _vinactive(vp);
430 spin_unlock(&vp->v_spin);
432 vx_unlock(vp);
435 /****************************************************************
436 * VX LOCKING FUNCTIONS *
437 ****************************************************************
439 * These functions lock vnodes for reclamation and deactivation related
440 * activities. The caller must already be holding some sort of reference
441 * on the vnode.
443 void
444 vx_lock(struct vnode *vp)
446 lockmgr(&vp->v_lock, LK_EXCLUSIVE);
449 void
450 vx_unlock(struct vnode *vp)
452 lockmgr(&vp->v_lock, LK_RELEASE);
455 /****************************************************************
456 * VNODE ACQUISITION FUNCTIONS *
457 ****************************************************************
459 * These functions must be used when accessing a vnode that has no
460 * chance of being destroyed in a SMP race. That means the caller will
461 * usually either hold an auxiliary reference (such as the namecache)
462 * or hold some other lock that ensures that the vnode cannot be destroyed.
464 * These functions are MANDATORY for any code chain accessing a vnode
465 * whose activation state is not known.
467 * vget() can be called with LK_NOWAIT and will return EBUSY if the
468 * lock cannot be immediately acquired.
470 * vget()/vput() are used when reactivation is desired.
472 * vx_get() and vx_put() are used when reactivation is not desired.
475 vget(struct vnode *vp, int flags)
477 int error;
480 * A lock type must be passed
482 if ((flags & LK_TYPE_MASK) == 0) {
483 panic("vget() called with no lock specified!");
484 /* NOT REACHED */
488 * Reference the structure and then acquire the lock.
490 * NOTE: The requested lock might be a shared lock and does
491 * not protect our access to the refcnt or other fields.
493 if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
494 atomic_add_int(&mycpu->gd_cachedvnodes, -1);
496 if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
498 * The lock failed, undo and return an error. This will not
499 * normally trigger a termination.
501 vrele(vp);
502 } else if (vp->v_flag & VRECLAIMED) {
504 * The node is being reclaimed and cannot be reactivated
505 * any more, undo and return ENOENT.
507 vn_unlock(vp);
508 vrele(vp);
509 error = ENOENT;
510 } else if (vp->v_state == VS_ACTIVE) {
512 * A VS_ACTIVE vnode coupled with the fact that we have
513 * a vnode lock (even if shared) prevents v_state from
514 * changing. Since the vnode is not in a VRECLAIMED state,
515 * we can safely clear VINACTIVE.
517 * NOTE! Multiple threads may clear VINACTIVE if this is
518 * shared lock. This race is allowed.
520 _vclrflags(vp, VINACTIVE); /* SMP race ok */
521 vp->v_act += VACT_INC;
522 if (vp->v_act > VACT_MAX) /* SMP race ok */
523 vp->v_act = VACT_MAX;
524 error = 0;
525 } else {
527 * If the vnode is not VS_ACTIVE it must be reactivated
528 * in addition to clearing VINACTIVE. An exclusive spin_lock
529 * is needed to manipulate the vnode's list.
531 * Because the lockmgr lock might be shared, we might race
532 * another reactivation, which we handle. In this situation,
533 * however, the refcnt prevents other v_state races.
535 * As with above, clearing VINACTIVE is allowed to race other
536 * clearings of VINACTIVE.
538 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
539 * the refcnt is non-zero and the vnode has not been
540 * reclaimed. This also means that the transitions do
541 * not affect cachedvnodes.
543 _vclrflags(vp, VINACTIVE);
544 vp->v_act += VACT_INC;
545 if (vp->v_act > VACT_MAX) /* SMP race ok */
546 vp->v_act = VACT_MAX;
547 spin_lock(&vp->v_spin);
549 switch(vp->v_state) {
550 case VS_INACTIVE:
551 _vactivate(vp);
552 atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
553 VREF_FINALIZE);
554 spin_unlock(&vp->v_spin);
555 break;
556 case VS_CACHED:
557 _vactivate(vp);
558 atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
559 VREF_FINALIZE);
560 spin_unlock(&vp->v_spin);
561 break;
562 case VS_ACTIVE:
563 atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
564 spin_unlock(&vp->v_spin);
565 break;
566 case VS_DYING:
567 spin_unlock(&vp->v_spin);
568 panic("Impossible VS_DYING state");
569 break;
571 error = 0;
573 return(error);
576 #ifdef DEBUG_VPUT
/*
 * DEBUG_VPUT variant of vput(): logs the vnode and the caller's
 * file/line, then unlocks the vnode and drops one reference.
 */
void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}
586 #else
/*
 * Unlock (vp) and then drop one reference on it.
 */
void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}
595 #endif
598 * Acquire the vnode lock unguarded.
600 * The non-blocking version also uses a slightly different mechanic.
601 * This function will explicitly fail not only if it cannot acquire
602 * the lock normally, but also if the caller already holds a lock.
604 * The adjusted mechanic is used to close a loophole where complex
605 * VOP_RECLAIM code can circle around recursively and allocate the
606 * same vnode it is trying to destroy from the freelist.
608 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
609 * cause the incorrect behavior to occur. If not for that lockmgr()
610 * would do the right thing.
612 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
614 void
615 vx_get(struct vnode *vp)
617 if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
618 atomic_add_int(&mycpu->gd_cachedvnodes, -1);
619 lockmgr(&vp->v_lock, LK_EXCLUSIVE);
623 vx_get_nonblock(struct vnode *vp)
625 int error;
627 if (lockcountnb(&vp->v_lock))
628 return(EBUSY);
629 error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
630 if (error == 0) {
631 if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
632 atomic_add_int(&mycpu->gd_cachedvnodes, -1);
634 return(error);
638 * Release a VX lock that also held a ref on the vnode. vrele() will handle
639 * any needed state transitions.
641 * However, filesystems use this function to get rid of unwanted new vnodes
642 * so try to get the vnode on the correct queue in that case.
644 void
645 vx_put(struct vnode *vp)
647 if (vp->v_type == VNON || vp->v_type == VBAD)
648 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
649 lockmgr(&vp->v_lock, LK_RELEASE);
650 vrele(vp);
654 * Try to reuse a vnode from the free list. This function is somewhat
655 * advisory in that NULL can be returned as a normal case, even if free
656 * vnodes are present.
658 * The scan is limited because it can result in excessive CPU use during
659 * periods of extreme vnode use.
661 * NOTE: The returned vnode is not completely initialized.
663 static
664 struct vnode *
665 cleanfreevnode(int maxcount)
667 struct vnode *vp;
668 int count;
669 int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
672 * Try to deactivate some vnodes cached on the active list.
674 if (countcachedvnodes(0) < inactivevnodes)
675 goto skip;
677 for (count = 0; count < maxcount * 2; count++) {
678 spin_lock(&vfs_spin);
680 vp = TAILQ_NEXT(&vnode_active_rover, v_list);
681 TAILQ_REMOVE(&vnode_active_list, &vnode_active_rover, v_list);
682 if (vp == NULL) {
683 TAILQ_INSERT_HEAD(&vnode_active_list,
684 &vnode_active_rover, v_list);
685 } else {
686 TAILQ_INSERT_AFTER(&vnode_active_list, vp,
687 &vnode_active_rover, v_list);
689 if (vp == NULL) {
690 spin_unlock(&vfs_spin);
691 continue;
693 if ((vp->v_refcnt & VREF_MASK) != 0) {
694 spin_unlock(&vfs_spin);
695 vp->v_act += VACT_INC;
696 if (vp->v_act > VACT_MAX) /* SMP race ok */
697 vp->v_act = VACT_MAX;
698 continue;
702 * decrement by less if the vnode's object has a lot of
703 * VM pages. XXX possible SMP races.
705 if (vp->v_act > 0) {
706 vm_object_t obj;
707 if ((obj = vp->v_object) != NULL &&
708 obj->resident_page_count >= trigger) {
709 vp->v_act -= 1;
710 } else {
711 vp->v_act -= VACT_INC;
713 if (vp->v_act < 0)
714 vp->v_act = 0;
715 spin_unlock(&vfs_spin);
716 continue;
720 * Try to deactivate the vnode.
722 if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
723 atomic_add_int(&mycpu->gd_cachedvnodes, -1);
724 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
726 spin_unlock(&vfs_spin);
727 vrele(vp);
730 skip:
732 * Loop trying to lock the first vnode on the free list.
733 * Cycle if we can't.
735 for (count = 0; count < maxcount; count++) {
736 spin_lock(&vfs_spin);
738 vp = TAILQ_FIRST(&vnode_inactive_list);
739 if (vp == NULL) {
740 spin_unlock(&vfs_spin);
741 break;
745 * non-blocking vx_get will also ref the vnode on success.
747 if (vx_get_nonblock(vp)) {
748 KKASSERT(vp->v_state == VS_INACTIVE);
749 TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
750 TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
751 spin_unlock(&vfs_spin);
752 continue;
756 * Because we are holding vfs_spin the vnode should currently
757 * be inactive and VREF_TERMINATE should still be set.
759 * Once vfs_spin is released the vnode's state should remain
760 * unmodified due to both the lock and ref on it.
762 KKASSERT(vp->v_state == VS_INACTIVE);
763 spin_unlock(&vfs_spin);
764 #ifdef TRACKVNODE
765 if ((u_long)vp == trackvnode)
766 kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
767 #endif
770 * Do not reclaim/reuse a vnode while auxillary refs exists.
771 * This includes namecache refs due to a related ncp being
772 * locked or having children, a VM object association, or
773 * other hold users.
775 * Do not reclaim/reuse a vnode if someone else has a real
776 * ref on it. This can occur if a filesystem temporarily
777 * releases the vnode lock during VOP_RECLAIM.
779 if (vp->v_auxrefs ||
780 (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
781 failed:
782 if (vp->v_state == VS_INACTIVE) {
783 spin_lock(&vfs_spin);
784 if (vp->v_state == VS_INACTIVE) {
785 TAILQ_REMOVE(&vnode_inactive_list,
786 vp, v_list);
787 TAILQ_INSERT_TAIL(&vnode_inactive_list,
788 vp, v_list);
790 spin_unlock(&vfs_spin);
792 vx_put(vp);
793 continue;
797 * VINACTIVE and VREF_TERMINATE are expected to both be set
798 * for vnodes pulled from the inactive list, and cannot be
799 * changed while we hold the vx lock.
801 * Try to reclaim the vnode.
803 KKASSERT(vp->v_flag & VINACTIVE);
804 KKASSERT(vp->v_refcnt & VREF_TERMINATE);
806 if ((vp->v_flag & VRECLAIMED) == 0) {
807 if (cache_inval_vp_nonblock(vp))
808 goto failed;
809 vgone_vxlocked(vp);
810 /* vnode is still VX locked */
814 * At this point if there are no other refs or auxrefs on
815 * the vnode with the inactive list locked, and we remove
816 * the vnode from the inactive list, it should not be
817 * possible for anyone else to access the vnode any more.
819 * Since the vnode is in a VRECLAIMED state, no new
820 * namecache associations could have been made and the
821 * vnode should have already been removed from its mountlist.
823 * Since we hold a VX lock on the vnode it cannot have been
824 * reactivated (moved out of the inactive list).
826 KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
827 spin_lock(&vfs_spin);
828 if (vp->v_auxrefs ||
829 (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
830 spin_unlock(&vfs_spin);
831 goto failed;
833 KKASSERT(vp->v_state == VS_INACTIVE);
834 TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
835 --inactivevnodes;
836 vp->v_state = VS_DYING;
837 spin_unlock(&vfs_spin);
840 * Nothing should have been able to access this vp. Only
841 * our ref should remain now.
843 atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
844 KASSERT(vp->v_refcnt == 1,
845 ("vp %p badrefs %08x", vp, vp->v_refcnt));
848 * Return a VX locked vnode suitable for reuse.
850 return(vp);
852 return(NULL);
856 * Obtain a new vnode. The returned vnode is VX locked & vrefd.
858 * All new vnodes set the VAGE flags. An open() of the vnode will
859 * decrement the (2-bit) flags. Vnodes which are opened several times
860 * are thus retained in the cache over vnodes which are merely stat()d.
862 * We always allocate the vnode. Attempting to recycle existing vnodes
863 * here can lead to numerous deadlocks, particularly with softupdates.
865 struct vnode *
866 allocvnode(int lktimeout, int lkflags)
868 struct vnode *vp;
871 * Do not flag for synchronous recyclement unless there are enough
872 * freeable vnodes to recycle and the number of vnodes has
873 * significantly exceeded our target. We want the normal vnlru
874 * process to handle the cleaning (at 9/10's) before we are forced
875 * to flag it here at 11/10's for userexit path processing.
877 if (numvnodes >= maxvnodes * 11 / 10 &&
878 cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
879 struct thread *td = curthread;
880 if (td->td_lwp)
881 atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
885 * lktimeout only applies when LK_TIMELOCK is used, and only
886 * the pageout daemon uses it. The timeout may not be zero
887 * or the pageout daemon can deadlock in low-VM situations.
889 if (lktimeout == 0)
890 lktimeout = hz / 10;
892 vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
894 lwkt_token_init(&vp->v_token, "vnode");
895 lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
896 TAILQ_INIT(&vp->v_namecache);
897 RB_INIT(&vp->v_rbclean_tree);
898 RB_INIT(&vp->v_rbdirty_tree);
899 RB_INIT(&vp->v_rbhash_tree);
900 spin_init(&vp->v_spin, "allocvnode");
902 lockmgr(&vp->v_lock, LK_EXCLUSIVE);
903 atomic_add_int(&numvnodes, 1);
904 vp->v_refcnt = 1;
905 vp->v_flag = VAGE0 | VAGE1;
906 vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
908 KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
909 /* exclusive lock still held */
911 vp->v_filesize = NOOFFSET;
912 vp->v_type = VNON;
913 vp->v_tag = 0;
914 vp->v_state = VS_CACHED;
915 _vactivate(vp);
917 return (vp);
921 * Called after a process has allocated a vnode via allocvnode()
922 * and we detected that too many vnodes were present.
924 * This function is called just prior to a return to userland if the
925 * process at some point had to allocate a new vnode during the last
926 * system call and the vnode count was found to be excessive.
928 * This is a synchronous path that we do not normally want to execute.
930 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
932 * WARNING: Sometimes numvnodes can blow out due to children being
933 * present under directory vnodes in the namecache. For the
934 * moment use an if() instead of a while() and note that if
935 * we were to use a while() we would still have to break out
936 * if freesomevnodes() returned 0. vnlru will also be trying
937 * hard to free vnodes at the same time (with a lower trigger
938 * pointer).
940 void
941 allocvnode_gc(void)
943 if (numvnodes >= maxvnodes &&
944 countcachedvnodes(0) + inactivevnodes >= maxvnodes * 5 / 10) {
945 freesomevnodes(batchfreevnodes);
950 freesomevnodes(int n)
952 struct vnode *vp;
953 int count = 0;
955 while (n) {
956 if ((vp = cleanfreevnode(n)) == NULL)
957 break;
958 vx_unlock(vp);
959 --n;
960 ++count;
961 kfree(vp, M_VNODE);
962 atomic_add_int(&numvnodes, -1);
964 return(count);