sys/kern/vfs_lock.c
/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */
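/*
 * A minimal usage sketch tying the table above to the entry points in
 * this file (illustrative only; the caller shown is hypothetical and
 * error handling is elided):
 *
 *	if (vget(vp, LK_SHARED) == 0) {	/- INACTIVE|CACHED -> ACTIVE -/
 *		...			/- vp is referenced and locked -/
 *		vput(vp);		/- unlock + vrele() -/
 *	}
 *
 * Deactivation back to INACTIVE happens on the final vrele() when
 * VREF_FINALIZE is set; the CACHED and DYING transitions are driven
 * through vx_lock()/vx_get() by the reclamation code below.
 */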
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)
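/*
 * For example (a sketch; the configuration is hypothetical): with
 * ncpus == 8, the hash scrambles the vnode pointer with VLIST_XOR,
 * folds it over the large prime, and reduces it modulo ncpus, picking
 * one of the eight per-cpu vnode_index structures:
 *
 *	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
 */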
TAILQ_HEAD(freelst, vnode);

struct vnode_index {
	struct freelst	active_list;
	struct vnode	active_rover;
	struct freelst	inactive_list;
	struct spinlock	spin;
	int	deac_rover;
	int	free_rover;
} __cachealign;

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	   &activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	   &cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	   &inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	   &batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	     &trackvnode, 0, "");
#endif
/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}
/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}
/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}
/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}
static __inline
void
_vinactive_tail(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}
/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}
void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}
/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}
/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}
/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
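/*
 * Usage sketch (the structure and field shown are hypothetical): a
 * subsystem caching a vnode pointer in its own data structure wraps
 * the pointer's lifetime with vhold()/vdrop(), so the vnode may still
 * be deactivated or reclaimed but can never be kfree()'d out from
 * under it:
 *
 *	vhold(vp);
 *	mystruct->ms_vp = vp;
 *	...
 *	mystruct->ms_vp = NULL;
 *	vdrop(vp);
 */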
/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}
/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}
/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}
#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif
/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}
/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
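/*
 * Sketch contrasting the two acquisition paths (hypothetical callers,
 * error handling elided).  vget()/vput() reactivate the vnode for
 * normal use; vx_get()/vx_put() do not, and are reserved for
 * deactivation and reclamation related code:
 *
 *	if (vget(vp, LK_EXCLUSIVE) == 0) {	/- reactivates -/
 *		...
 *		vput(vp);
 *	}
 *
 *	vx_get(vp);				/- does not reactivate -/
 *	...
 *	vx_put(vp);
 */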
/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}
/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				TAILQ_REMOVE(&vi->inactive_list,
					     vp, v_list);
				TAILQ_INSERT_TAIL(&vi->inactive_list,
						  vp, v_list);
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}
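/*
 * Consumer sketch (hypothetical caller; filesystems normally reach this
 * code via higher level vnode allocation paths rather than calling it
 * directly): the vnode returned by allocvnode() is VX locked and
 * referenced.  A caller finishes initializing it, while an unwanted
 * vnode (still VNON) is disposed of with vx_put(), which flags it for
 * finalization and routes it back to the correct queue:
 *
 *	vp = allocvnode(0, 0);
 *	...				/- caller-specific setup -/
 *	if (setup_failed)		/- hypothetical condition -/
 *		vx_put(vp);		/- VNON vnode gets finalized -/
 */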
/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    point).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}