/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/dnlc.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/bitmap.h>
#include <sys/var.h>
#include <sys/sysmacros.h>
#include <sys/kstat.h>
#include <sys/atomic.h>
#include <sys/taskq.h>

/*
 * Directory name lookup cache.
 * Based on code originally done by Robert Elz at Melbourne.
 *
 * Names found by directory scans are retained in a cache
 * for future reference. Each hash chain is ordered by LRU.
 * The cache is indexed by a hash value obtained from (vp, name),
 * where vp refers to the directory containing the name.
 */

/*
 * We want to be able to identify files that are referenced only by the DNLC.
 * When adding a reference from the DNLC, call VN_HOLD_DNLC instead of VN_HOLD,
 * since multiple DNLC references should only be counted once in v_count. This
 * file contains only two (2) calls to VN_HOLD, renamed VN_HOLD_CALLER in the
 * hope that no one will mistakenly add a VN_HOLD to this file. (Unfortunately
 * it is not possible to #undef VN_HOLD and retain VN_HOLD_CALLER. Ideally a
 * Makefile rule would grep uncommented C tokens to check that VN_HOLD is
 * referenced only once in this file, to define VN_HOLD_CALLER.)
 */

#define VN_HOLD_CALLER	VN_HOLD
#define VN_HOLD_DNLC(vp)	{ \
	mutex_enter(&(vp)->v_lock); \
	if ((vp)->v_count_dnlc == 0) \
		(vp)->v_count++; \
	(vp)->v_count_dnlc++; \
	mutex_exit(&(vp)->v_lock); \
}
#define VN_RELE_DNLC(vp)	{ \
	vn_rele_dnlc(vp); \
}

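/*
 * Illustrative note (editorial, not in the original comments): if several
 * DNLC entries reference the same vnode, only the first VN_HOLD_DNLC bumps
 * v_count; subsequent ones bump v_count_dnlc alone, and vn_rele_dnlc()
 * (defined in the vnode layer) is expected to drop the single v_count
 * contribution once v_count_dnlc returns to 0.
 */
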
/*
 * Tunable nc_hashavelen is the average length desired for this chain, from
 * which the size of the nc_hash table is derived at create time.
 */
#define NC_HASHAVELEN_DEFAULT	4
int nc_hashavelen = NC_HASHAVELEN_DEFAULT;

/*
 * NC_MOVETOFRONT is the move-to-front threshold: if the hash lookup
 * depth exceeds this value, we move the looked-up entry to the front of
 * its hash chain. The idea is to make sure that the most frequently
 * accessed entries are found most quickly (by keeping them near the
 * front of their hash chains).
 */
#define NC_MOVETOFRONT	2

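/*
 * Concretely (editorial note): dnlc_lookup() starts its depth count at 1,
 * so with NC_MOVETOFRONT == 2 an entry found at depth 3 or deeper in its
 * chain is relinked to the head.
 */
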
/*
 * DNLC_MAX_RELE is used to size an array on the stack when releasing
 * vnodes. This array is used rather than calling VN_RELE() inline because
 * all dnlc locks must be dropped by that time in order to avoid a
 * possible deadlock. This deadlock occurs when the dnlc holds the last
 * reference to the vnode and so the fop_inactive vector is called, which
 * can in turn call back into the dnlc. A global array was used previously
 * but had many problems:
 *	1) It had no real upper bound on the array size, as entries
 *	   can be added after the purge has started.
 *	2) The locking scheme caused a hang.
 *	3) It caused serialisation on the global lock.
 *	4) The array was often unnecessarily huge.
 *
 * Note the current value of 8 allows up to 4 cache entries (to be purged
 * from each hash chain) before having to cycle around and retry.
 * This ought to be ample given that nc_hashavelen is typically very small.
 */
#define DNLC_MAX_RELE	8 /* must be even */

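/*
 * The "must be even" requirement follows from usage (editorial note): each
 * purged ncache_t contributes a (vp, dp) pair to the on-stack rele array,
 * so 8 slots hold the holds of 4 cache entries per pass.
 */
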
/*
 * Hash table of name cache entries for fast lookup, dynamically
 * allocated at startup.
 */
nc_hash_t *nc_hash;

/*
 * Rotors. Used to select entries on a round-robin basis.
 */
static nc_hash_t *dnlc_purge_fs1_rotor;
static nc_hash_t *dnlc_free_rotor;

/*
 * # of dnlc entries (uninitialized)
 *
 * the initial value was chosen as being
 * a random string of bits, probably not
 * normally chosen by a systems administrator
 */
int ncsize = -1;
volatile uint32_t dnlc_nentries = 0;	/* current num of name cache entries */
static int nc_hashsz;			/* size of hash table */
static int nc_hashmask;			/* size of hash table minus 1 */

/*
 * The dnlc_reduce_cache() taskq queue is activated when there are
 * ncsize name cache entries, and if no parameter is provided it reduces
 * the size down to dnlc_nentries_low_water, which is by default one
 * hundredth below (i.e. 99% of) ncsize.
 *
 * If a parameter is provided to dnlc_reduce_cache(), then we reduce
 * the size down based on ncsize_onepercent - where ncsize_onepercent
 * is 1% of ncsize; however, we never let dnlc_reduce_cache() reduce
 * the size below 3% of ncsize (ncsize_min_percent).
 */
#define DNLC_LOW_WATER_DIVISOR_DEFAULT	100
uint_t dnlc_low_water_divisor = DNLC_LOW_WATER_DIVISOR_DEFAULT;
uint_t dnlc_nentries_low_water;
int dnlc_reduce_idle = 1;	/* no locking needed */
uint_t ncsize_onepercent;
uint_t ncsize_min_percent;

/*
 * If dnlc_nentries hits dnlc_max_nentries (twice ncsize)
 * then this means the dnlc_reduce_cache() taskq is failing to
 * keep up. In this case we refuse to add new entries to the dnlc
 * until the taskq catches up.
 */
uint_t dnlc_max_nentries;		/* twice ncsize */
uint64_t dnlc_max_nentries_cnt = 0;	/* statistic on times we failed */

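/*
 * Worked example of the sizing rules above (illustrative numbers only):
 * with ncsize tuned to 10000 and the default divisor of 100,
 * dnlc_nentries_low_water = 10000 - 10000/100 = 9900,
 * ncsize_onepercent = 100, ncsize_min_percent = 300 and
 * dnlc_max_nentries = 20000.
 */
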
/*
 * Tunable to define when we should just remove items from
 * the end of the chain.
 */
#define DNLC_LONG_CHAIN	8
uint_t dnlc_long_chain = DNLC_LONG_CHAIN;

/*
 * ncstats has been deprecated, due to the integer size of the counters
 * which can easily overflow in the dnlc.
 * It is maintained (at some expense) for compatibility.
 * The preferred interface is the kstat accessible nc_stats below.
 */
struct ncstats ncstats;

struct nc_stats ncs = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "negative_cache_hits",	KSTAT_DATA_UINT64 },
	{ "enters",			KSTAT_DATA_UINT64 },
	{ "double_enters",		KSTAT_DATA_UINT64 },
	{ "purge_total_entries",	KSTAT_DATA_UINT64 },
	{ "purge_all",			KSTAT_DATA_UINT64 },
	{ "purge_vp",			KSTAT_DATA_UINT64 },
	{ "purge_vfs",			KSTAT_DATA_UINT64 },
	{ "purge_fs1",			KSTAT_DATA_UINT64 },
	{ "pick_free",			KSTAT_DATA_UINT64 },
	{ "pick_heuristic",		KSTAT_DATA_UINT64 },
	{ "pick_last",			KSTAT_DATA_UINT64 },

	/* directory caching stats */

	{ "dir_hits",			KSTAT_DATA_UINT64 },
	{ "dir_misses",			KSTAT_DATA_UINT64 },
	{ "dir_cached_current",		KSTAT_DATA_UINT64 },
	{ "dir_entries_cached_current",	KSTAT_DATA_UINT64 },
	{ "dir_cached_total",		KSTAT_DATA_UINT64 },
	{ "dir_start_no_memory",	KSTAT_DATA_UINT64 },
	{ "dir_add_no_memory",		KSTAT_DATA_UINT64 },
	{ "dir_add_abort",		KSTAT_DATA_UINT64 },
	{ "dir_add_max",		KSTAT_DATA_UINT64 },
	{ "dir_remove_entry_fail",	KSTAT_DATA_UINT64 },
	{ "dir_remove_space_fail",	KSTAT_DATA_UINT64 },
	{ "dir_update_fail",		KSTAT_DATA_UINT64 },
	{ "dir_fini_purge",		KSTAT_DATA_UINT64 },
	{ "dir_reclaim_last",		KSTAT_DATA_UINT64 },
	{ "dir_reclaim_any",		KSTAT_DATA_UINT64 },
};

static int doingcache = 1;

vnode_t negative_cache_vnode;

/*
 * Insert entry at the front of the queue
 */
#define nc_inshash(ncp, hp) \
{ \
	(ncp)->hash_next = (hp)->hash_next; \
	(ncp)->hash_prev = (ncache_t *)(hp); \
	(hp)->hash_next->hash_prev = (ncp); \
	(hp)->hash_next = (ncp); \
}

/*
 * Remove entry from hash queue
 */
#define nc_rmhash(ncp) \
{ \
	(ncp)->hash_prev->hash_next = (ncp)->hash_next; \
	(ncp)->hash_next->hash_prev = (ncp)->hash_prev; \
	(ncp)->hash_prev = NULL; \
	(ncp)->hash_next = NULL; \
}

/*
 * Free an entry.
 */
#define dnlc_free(ncp) \
{ \
	kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \
	atomic_dec_32(&dnlc_nentries); \
}

/*
 * Cached directory info.
 * ======================
 */

/*
 * Cached directory free space hash function.
 * Needs the free space handle and the dcp to get the hash table size.
 * Returns the hash index.
 */
#define DDFHASH(handle, dcp)	((handle >> 2) & (dcp)->dc_fhash_mask)

/*
 * Cached directory name entry hash function.
 * Uses the name and returns in the input arguments the hash and the name
 * length.
 */
#define DNLC_DIR_HASH(name, hash, namelen) \
{ \
	char Xc; \
	const char *Xcp; \
	hash = *name; \
	for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++) \
		hash = (hash << 4) + hash + Xc; \
	ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1)); \
	namelen = Xcp - (name); \
}

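/*
 * For example (editorial note): for the name "ab" the macro computes
 * hash = 'a', then hash = (hash << 4) + hash + 'b' = 'a' * 17 + 'b',
 * i.e. a multiply-by-17 string hash, and sets namelen = 2. The ASSERT
 * bounds the component length to what a uchar_t can represent
 * (255, since NBBY is 8).
 */
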
/* special dircache_t pointer to indicate error should be returned */
/*
 * The anchor directory cache pointer can contain 3 types of values,
 * 1) NULL: No directory cache
 * 2) DC_RET_LOW_MEM: There was a directory cache that was found to be
 *    too big or a memory shortage occurred. This value remains in the
 *    pointer until a dnlc_dir_start(), which returns a DNOMEM error.
 *    This is kludgy but efficient and only visible in this source file.
 * 3) A valid cache pointer.
 */
#define DC_RET_LOW_MEM	(dircache_t *)1
#define VALID_DIR_CACHE(dcp)	((dircache_t *)(dcp) > DC_RET_LOW_MEM)

/* Tunables */
uint_t dnlc_dir_enable = 1;	/* disable caching directories by setting to 0 */
uint_t dnlc_dir_min_size = 40;	/* min no of directory entries before caching */
uint_t dnlc_dir_max_size = UINT_MAX;	/* ditto maximum */
uint_t dnlc_dir_hash_size_shift = 3;	/* 8 entries per hash bucket */
uint_t dnlc_dir_min_reclaim = 350000;	/* approx 1MB of dcentrys */
/*
 * dnlc_dir_hash_resize_shift determines when the hash tables
 * get re-adjusted due to growth or shrinkage
 * - currently 2, indicating that there can be at most 4
 * times or at least one quarter the number of entries
 * before hash table readjustment. Note that with
 * dnlc_dir_hash_size_shift above set at 3 this would
 * mean readjustment would occur if the average number
 * of entries went above 32 or below 2
 */
uint_t dnlc_dir_hash_resize_shift = 2;	/* readjust rate */

static kmem_cache_t *dnlc_dir_space_cache;	/* free space entry cache */
static dchead_t dc_head;	/* anchor of cached directories */

/* Prototypes */
static ncache_t *dnlc_get(uchar_t namlen);
static ncache_t *dnlc_search(vnode_t *dp, const char *name, uchar_t namlen,
    int hash);
static void dnlc_dir_reclaim(void *unused);
static void dnlc_dir_abort(dircache_t *dcp);
static void dnlc_dir_adjust_fhash(dircache_t *dcp);
static void dnlc_dir_adjust_nhash(dircache_t *dcp);
static void do_dnlc_reduce_cache(void *);

/*
 * Initialize the directory cache.
 */
void
dnlc_init()
{
	nc_hash_t *hp;
	kstat_t *ksp;
	int i;

	/*
	 * Set up the size of the dnlc (ncsize) and its low water mark.
	 */
	if (ncsize == -1) {
		/* calculate a reasonable size for the low water */
		dnlc_nentries_low_water = 4 * (v.v_proc + maxusers) + 320;
		ncsize = dnlc_nentries_low_water +
		    (dnlc_nentries_low_water / dnlc_low_water_divisor);
	} else {
		/* don't change the user specified ncsize */
		dnlc_nentries_low_water =
		    ncsize - (ncsize / dnlc_low_water_divisor);
	}
	if (ncsize <= 0) {
		doingcache = 0;
		dnlc_dir_enable = 0; /* also disable directory caching */
		ncsize = 0;
		cmn_err(CE_NOTE, "name cache (dnlc) disabled");
		return;
	}
	dnlc_max_nentries = ncsize * 2;
	ncsize_onepercent = ncsize / 100;
	ncsize_min_percent = ncsize_onepercent * 3;

	/*
	 * Initialise the hash table.
	 * Compute hash size rounding to the next power of two.
	 */
	nc_hashsz = ncsize / nc_hashavelen;
	nc_hashsz = 1 << highbit(nc_hashsz);
	nc_hashmask = nc_hashsz - 1;
	nc_hash = kmem_zalloc(nc_hashsz * sizeof (*nc_hash), KM_SLEEP);
	for (i = 0; i < nc_hashsz; i++) {
		hp = (nc_hash_t *)&nc_hash[i];
		mutex_init(&hp->hash_lock, NULL, MUTEX_DEFAULT, NULL);
		hp->hash_next = (ncache_t *)hp;
		hp->hash_prev = (ncache_t *)hp;
	}

	/*
	 * Initialize rotors
	 */
	dnlc_free_rotor = dnlc_purge_fs1_rotor = &nc_hash[0];

	/*
	 * Set up the directory caching to use kmem_cache_alloc
	 * for its free space entries so that we can get a callback
	 * when the system is short on memory, to allow us to free
	 * up some memory. We don't use the constructor/destructor
	 * functions.
	 */
	dnlc_dir_space_cache = kmem_cache_create("dnlc_space_cache",
	    sizeof (dcfree_t), 0, NULL, NULL, dnlc_dir_reclaim, NULL,
	    NULL, 0);

	/*
	 * Initialise the head of the cached directory structures
	 */
	mutex_init(&dc_head.dch_lock, NULL, MUTEX_DEFAULT, NULL);
	dc_head.dch_next = (dircache_t *)&dc_head;
	dc_head.dch_prev = (dircache_t *)&dc_head;

	/*
	 * Initialise the reference count of the negative cache vnode to 1
	 * so that it never goes away (fop_inactive isn't called on it).
	 */
	negative_cache_vnode.v_count = 1;
	negative_cache_vnode.v_count_dnlc = 0;

	/*
	 * Initialise kstats - both the old compatibility raw kind and
	 * the more extensive named stats.
	 */
	ksp = kstat_create("unix", 0, "ncstats", "misc", KSTAT_TYPE_RAW,
	    sizeof (struct ncstats), KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *) &ncstats;
		kstat_install(ksp);
	}
	ksp = kstat_create("unix", 0, "dnlcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (ncs) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *) &ncs;
		kstat_install(ksp);
	}
}

/*
 * Add a name to the directory cache.
 */
void
dnlc_enter(vnode_t *dp, const char *name, vnode_t *vp)
{
	ncache_t *ncp;
	nc_hash_t *hp;
	uchar_t namlen;
	int hash;

	TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_enter_start:");

	if (!doingcache) {
		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
		    "dnlc_enter_end:(%S) %d", "not caching", 0);
		return;
	}

	/*
	 * Get a new dnlc entry. Assume the entry won't be in the cache
	 * and initialize it now.
	 */
	DNLCHASH(name, dp, hash, namlen);
	if ((ncp = dnlc_get(namlen)) == NULL)
		return;
	ncp->dp = dp;
	VN_HOLD_DNLC(dp);
	ncp->vp = vp;
	VN_HOLD_DNLC(vp);
	bcopy(name, ncp->name, namlen + 1); /* name and null */
	ncp->hash = hash;
	hp = &nc_hash[hash & nc_hashmask];

	mutex_enter(&hp->hash_lock);
	if (dnlc_search(dp, name, namlen, hash) != NULL) {
		mutex_exit(&hp->hash_lock);
		ncstats.dbl_enters++;
		ncs.ncs_dbl_enters.value.ui64++;
		VN_RELE_DNLC(dp);
		VN_RELE_DNLC(vp);
		dnlc_free(ncp);		/* crfree done here */
		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
		    "dnlc_enter_end:(%S) %d", "dbl enter", ncstats.dbl_enters);
		return;
	}
	/*
	 * Insert back into the hash chain.
	 */
	nc_inshash(ncp, hp);
	mutex_exit(&hp->hash_lock);
	ncstats.enters++;
	ncs.ncs_enters.value.ui64++;
	TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
	    "dnlc_enter_end:(%S) %d", "done", ncstats.enters);
}

/*
 * Add a name to the directory cache.
 *
 * This function is basically identical to dnlc_enter().
 * The difference is that when the desired dnlc entry is
 * found, the vnode in the ncache is compared with the
 * vnode passed in.
 *
 * If they are not equal then the ncache is updated with
 * the passed-in vnode. Otherwise it just frees up the
 * newly allocated dnlc entry.
 */
void
dnlc_update(vnode_t *dp, const char *name, vnode_t *vp)
{
	ncache_t *ncp;
	ncache_t *tcp;
	vnode_t *tvp;
	nc_hash_t *hp;
	int hash;
	uchar_t namlen;

	TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_update_start:");

	if (!doingcache) {
		TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
		    "dnlc_update_end:(%S) %d", "not caching", 0);
		return;
	}

	/*
	 * Get a new dnlc entry and initialize it now.
	 * If we fail to get a new entry, call dnlc_remove() to purge
	 * any existing dnlc entry, including a negative cache
	 * (DNLC_NO_VNODE) entry.
	 * Failure to clear an existing entry could result in a false
	 * dnlc lookup (negative/stale entry).
	 */
	DNLCHASH(name, dp, hash, namlen);
	if ((ncp = dnlc_get(namlen)) == NULL) {
		dnlc_remove(dp, name);
		return;
	}
	ncp->dp = dp;
	VN_HOLD_DNLC(dp);
	ncp->vp = vp;
	VN_HOLD_DNLC(vp);
	bcopy(name, ncp->name, namlen + 1); /* name and null */
	ncp->hash = hash;
	hp = &nc_hash[hash & nc_hashmask];

	mutex_enter(&hp->hash_lock);
	if ((tcp = dnlc_search(dp, name, namlen, hash)) != NULL) {
		if (tcp->vp != vp) {
			tvp = tcp->vp;
			tcp->vp = vp;
			mutex_exit(&hp->hash_lock);
			VN_RELE_DNLC(tvp);
			ncstats.enters++;
			ncs.ncs_enters.value.ui64++;
			TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
			    "dnlc_update_end:(%S) %d", "done", ncstats.enters);
		} else {
			mutex_exit(&hp->hash_lock);
			VN_RELE_DNLC(vp);
			ncstats.dbl_enters++;
			ncs.ncs_dbl_enters.value.ui64++;
			TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
			    "dnlc_update_end:(%S) %d",
			    "dbl enter", ncstats.dbl_enters);
		}
		VN_RELE_DNLC(dp);
		dnlc_free(ncp);		/* crfree done here */
		return;
	}
	/*
	 * Insert the new entry, since it is not in the dnlc yet.
	 */
	nc_inshash(ncp, hp);
	mutex_exit(&hp->hash_lock);
	ncstats.enters++;
	ncs.ncs_enters.value.ui64++;
	TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
	    "dnlc_update_end:(%S) %d", "done", ncstats.enters);
}

/*
 * Look up a name in the directory name cache.
 *
 * Return a doubly-held vnode if found: one hold so that it may
 * remain in the cache for other users, the other hold so that
 * the cache entry is not recycled and the identity of the vnode
 * lost before the caller can use the vnode.
 */
vnode_t *
dnlc_lookup(vnode_t *dp, const char *name)
{
	ncache_t *ncp;
	nc_hash_t *hp;
	vnode_t *vp;
	int hash, depth;
	uchar_t namlen;

	TRACE_2(TR_FAC_NFS, TR_DNLC_LOOKUP_START,
	    "dnlc_lookup_start:dp %x name %s", dp, name);

	if (!doingcache) {
		TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
		    "dnlc_lookup_end:%S %d vp %x name %s",
		    "not_caching", 0, NULL, name);
		return (NULL);
	}

	DNLCHASH(name, dp, hash, namlen);
	depth = 1;
	hp = &nc_hash[hash & nc_hashmask];
	mutex_enter(&hp->hash_lock);

	for (ncp = hp->hash_next; ncp != (ncache_t *)hp;
	    ncp = ncp->hash_next) {
		if (ncp->hash == hash &&	/* fast signature check */
		    ncp->dp == dp &&
		    ncp->namlen == namlen &&
		    bcmp(ncp->name, name, namlen) == 0) {
			/*
			 * Move this entry to the head of its hash chain
			 * if it's not already close.
			 */
			if (depth > NC_MOVETOFRONT) {
				ncache_t *next = ncp->hash_next;
				ncache_t *prev = ncp->hash_prev;

				prev->hash_next = next;
				next->hash_prev = prev;
				ncp->hash_next = next = hp->hash_next;
				ncp->hash_prev = (ncache_t *)hp;
				next->hash_prev = ncp;
				hp->hash_next = ncp;

				ncstats.move_to_front++;
			}

			/*
			 * Put a hold on the vnode now so its identity
			 * can't change before the caller has a chance to
			 * put a hold on it.
			 */
			vp = ncp->vp;
			VN_HOLD_CALLER(vp); /* VN_HOLD 1 of 2 in this file */
			mutex_exit(&hp->hash_lock);
			ncstats.hits++;
			ncs.ncs_hits.value.ui64++;
			if (vp == DNLC_NO_VNODE) {
				ncs.ncs_neg_hits.value.ui64++;
			}
			TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
			    "dnlc_lookup_end:%S %d vp %x name %s", "hit",
			    ncstats.hits, vp, name);
			return (vp);
		}
		depth++;
	}

	mutex_exit(&hp->hash_lock);
	ncstats.misses++;
	ncs.ncs_misses.value.ui64++;
	TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
	    "dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses,
	    NULL, name);
	return (NULL);
}

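/*
 * Caller note (editorial): the returned vnode may be DNLC_NO_VNODE, the
 * shared negative-cache vnode; the VN_HOLD_CALLER above is taken on it
 * too, so callers must both test for it and release it like any other
 * returned vnode.
 */
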
/*
 * Remove an entry in the directory name cache.
 */
void
dnlc_remove(vnode_t *dp, const char *name)
{
	ncache_t *ncp;
	nc_hash_t *hp;
	uchar_t namlen;
	int hash;

	if (!doingcache)
		return;
	DNLCHASH(name, dp, hash, namlen);
	hp = &nc_hash[hash & nc_hashmask];

	mutex_enter(&hp->hash_lock);
	if ((ncp = dnlc_search(dp, name, namlen, hash)) != NULL) {
		/*
		 * Free up the entry.
		 */
		nc_rmhash(ncp);
		mutex_exit(&hp->hash_lock);
		VN_RELE_DNLC(ncp->vp);
		VN_RELE_DNLC(ncp->dp);
		dnlc_free(ncp);
		return;
	}
	mutex_exit(&hp->hash_lock);
}

/*
 * Purge the entire cache.
 */
void
dnlc_purge()
{
	nc_hash_t *nch;
	ncache_t *ncp;
	int index;
	int i;
	vnode_t *nc_rele[DNLC_MAX_RELE];

	if (!doingcache)
		return;

	ncstats.purges++;
	ncs.ncs_purge_all.value.ui64++;

	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
		index = 0;
		mutex_enter(&nch->hash_lock);
		ncp = nch->hash_next;
		while (ncp != (ncache_t *)nch) {
			ncache_t *np;

			np = ncp->hash_next;
			nc_rele[index++] = ncp->vp;
			nc_rele[index++] = ncp->dp;

			nc_rmhash(ncp);
			dnlc_free(ncp);
			ncp = np;
			ncs.ncs_purge_total.value.ui64++;
			if (index == DNLC_MAX_RELE)
				break;
		}
		mutex_exit(&nch->hash_lock);

		/* Release holds on all the vnodes now that we have no locks */
		for (i = 0; i < index; i++) {
			VN_RELE_DNLC(nc_rele[i]);
		}
		if (ncp != (ncache_t *)nch) {
			nch--; /* Do current hash chain again */
		}
	}
}

/*
 * Purge any cache entries referencing a vnode. Exit as soon as the dnlc
 * reference count goes to zero (the caller still holds a reference).
 */
void
dnlc_purge_vp(vnode_t *vp)
{
	nc_hash_t *nch;
	ncache_t *ncp;
	int index;
	vnode_t *nc_rele[DNLC_MAX_RELE];

	ASSERT(vp->v_count > 0);
	if (vp->v_count_dnlc == 0) {
		return;
	}

	if (!doingcache)
		return;

	ncstats.purges++;
	ncs.ncs_purge_vp.value.ui64++;

	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
		index = 0;
		mutex_enter(&nch->hash_lock);
		ncp = nch->hash_next;
		while (ncp != (ncache_t *)nch) {
			ncache_t *np;

			np = ncp->hash_next;
			if (ncp->dp == vp || ncp->vp == vp) {
				nc_rele[index++] = ncp->vp;
				nc_rele[index++] = ncp->dp;
				nc_rmhash(ncp);
				dnlc_free(ncp);
				ncs.ncs_purge_total.value.ui64++;
				if (index == DNLC_MAX_RELE) {
					ncp = np;
					break;
				}
			}
			ncp = np;
		}
		mutex_exit(&nch->hash_lock);

		/* Release holds on all the vnodes now that we have no locks */
		while (index) {
			VN_RELE_DNLC(nc_rele[--index]);
		}

		if (vp->v_count_dnlc == 0) {
			return;
		}

		if (ncp != (ncache_t *)nch) {
			nch--; /* Do current hash chain again */
		}
	}
}

/*
 * Purge cache entries referencing a vfsp. Caller supplies a count
 * of entries to purge; up to that many will be freed. A count of
 * zero indicates that all such entries should be purged. Returns
 * the number of entries that were purged.
 */
int
dnlc_purge_vfsp(vfs_t *vfsp, int count)
{
	nc_hash_t *nch;
	ncache_t *ncp;
	int n = 0;
	int index;
	int i;
	vnode_t *nc_rele[DNLC_MAX_RELE];

	if (!doingcache)
		return (0);

	ncstats.purges++;
	ncs.ncs_purge_vfs.value.ui64++;

	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
		index = 0;
		mutex_enter(&nch->hash_lock);
		ncp = nch->hash_next;
		while (ncp != (ncache_t *)nch) {
			ncache_t *np;

			np = ncp->hash_next;
			ASSERT(ncp->dp != NULL);
			ASSERT(ncp->vp != NULL);
			if ((ncp->dp->v_vfsp == vfsp) ||
			    (ncp->vp->v_vfsp == vfsp)) {
				n++;
				nc_rele[index++] = ncp->vp;
				nc_rele[index++] = ncp->dp;
				nc_rmhash(ncp);
				dnlc_free(ncp);
				ncs.ncs_purge_total.value.ui64++;
				if (index == DNLC_MAX_RELE) {
					ncp = np;
					break;
				}
				if (count != 0 && n >= count) {
					break;
				}
			}
			ncp = np;
		}
		mutex_exit(&nch->hash_lock);

		/* Release holds on all the vnodes now that we have no locks */
		for (i = 0; i < index; i++) {
			VN_RELE_DNLC(nc_rele[i]);
		}
		if (count != 0 && n >= count) {
			return (n);
		}
		if (ncp != (ncache_t *)nch) {
			nch--; /* Do current hash chain again */
		}
	}
	return (n);
}

/*
 * Purge 1 entry from the dnlc that is part of the filesystem(s)
 * represented by 'vop'. The purpose of this routine is to allow
 * users of the dnlc to free a vnode that is being held by the dnlc.
 *
 * Returns 1 if we found and released a vnode whose release will
 * free the underlying vnode (its count was 1), and 0 if no
 * appropriate vnode was found.
 *
 * Note, vop is not the 'right' identifier for a filesystem.
 */
int
dnlc_fs_purge1(vnodeops_t *vop)
{
	nc_hash_t *end;
	nc_hash_t *hp;
	ncache_t *ncp;
	vnode_t *vp;

	if (!doingcache)
		return (0);

	ncs.ncs_purge_fs1.value.ui64++;

	/*
	 * Scan the dnlc entries looking for a likely candidate.
	 */
	hp = end = dnlc_purge_fs1_rotor;

	do {
		if (++hp == &nc_hash[nc_hashsz])
			hp = nc_hash;
		dnlc_purge_fs1_rotor = hp;
		if (hp->hash_next == (ncache_t *)hp)
			continue;
		mutex_enter(&hp->hash_lock);
		for (ncp = hp->hash_prev;
		    ncp != (ncache_t *)hp;
		    ncp = ncp->hash_prev) {
			vp = ncp->vp;
			if (!vn_has_cached_data(vp) && (vp->v_count == 1) &&
			    vn_matchops(vp, vop))
				break;
		}
		if (ncp != (ncache_t *)hp) {
			nc_rmhash(ncp);
			mutex_exit(&hp->hash_lock);
			VN_RELE_DNLC(ncp->dp);
			VN_RELE_DNLC(vp);
			dnlc_free(ncp);
			ncs.ncs_purge_total.value.ui64++;
			return (1);
		}
		mutex_exit(&hp->hash_lock);
	} while (hp != end);
	return (0);
}

/*
 * Perform a reverse lookup in the DNLC. This will find the first occurrence
 * of the vnode. If successful, it will return the vnode of the parent, and
 * the name of the entry in the given buffer. If it cannot be found, or the
 * buffer is too small, then it will return NULL. Note that this is a highly
 * inefficient function, since the DNLC is constructed solely for forward
 * lookups.
 */
vnode_t *
dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen)
{
	nc_hash_t *nch;
	ncache_t *ncp;
	vnode_t *pvp;

	if (!doingcache)
		return (NULL);

	for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
		mutex_enter(&nch->hash_lock);
		ncp = nch->hash_next;
		while (ncp != (ncache_t *)nch) {
			/*
			 * We ignore '..' entries since they can create
			 * confusion and infinite loops.
			 */
			if (ncp->vp == vp && !(ncp->namlen == 2 &&
			    0 == bcmp(ncp->name, "..", 2)) &&
			    ncp->namlen < buflen) {
				bcopy(ncp->name, buf, ncp->namlen);
				buf[ncp->namlen] = '\0';
				pvp = ncp->dp;
				/* VN_HOLD 2 of 2 in this file */
				VN_HOLD_CALLER(pvp);
				mutex_exit(&nch->hash_lock);
				return (pvp);
			}
			ncp = ncp->hash_next;
		}
		mutex_exit(&nch->hash_lock);
	}
	return (NULL);
}

/*
 * Utility routine to search for a cache entry. Return the
 * ncache entry if found, NULL otherwise.
 */
static ncache_t *
dnlc_search(vnode_t *dp, const char *name, uchar_t namlen, int hash)
{
	nc_hash_t *hp;
	ncache_t *ncp;

	hp = &nc_hash[hash & nc_hashmask];

	for (ncp = hp->hash_next; ncp != (ncache_t *)hp; ncp = ncp->hash_next) {
		if (ncp->hash == hash &&
		    ncp->dp == dp &&
		    ncp->namlen == namlen &&
		    bcmp(ncp->name, name, namlen) == 0)
			return (ncp);
	}
	return (NULL);
}

#if ((1 << NBBY) - 1) < (MAXNAMELEN - 1)
#error ncache_t name length representation is too small
#endif

void
dnlc_reduce_cache(void *reduce_percent)
{
	if (dnlc_reduce_idle && (dnlc_nentries >= ncsize || reduce_percent)) {
		dnlc_reduce_idle = 0;
		if ((taskq_dispatch(system_taskq, do_dnlc_reduce_cache,
		    reduce_percent, TQ_NOSLEEP)) == (uintptr_t)NULL)
			dnlc_reduce_idle = 1;
	}
}

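/*
 * Editorial note: reduce_percent is a pointer-encoded integer rather than
 * a real address - callers pass e.g. (void *)3 to request a 3% reduction,
 * which do_dnlc_reduce_cache() below decodes via (uint_t)(uintptr_t).
 */
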
/*
 * Get a new name cache entry.
 * If the dnlc_reduce_cache() taskq isn't keeping up with demand, or memory
 * is short, then just return NULL. If we're over ncsize then kick off a
 * thread to free some in-use entries down to dnlc_nentries_low_water.
 * Caller must initialise all fields except namlen.
 * Component names are defined to be less than MAXNAMELEN,
 * which includes a null.
 */
static ncache_t *
dnlc_get(uchar_t namlen)
{
	ncache_t *ncp;

	if (dnlc_nentries > dnlc_max_nentries) {
		dnlc_max_nentries_cnt++; /* keep a statistic */
		return (NULL);
	}
	ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP);
	if (ncp == NULL) {
		return (NULL);
	}
	ncp->namlen = namlen;
	atomic_inc_32(&dnlc_nentries);
	dnlc_reduce_cache(NULL);
	return (ncp);
}

/*
 * Taskq routine to free up name cache entries to reduce the
 * cache size to the low water mark if "reduce_percent" is not provided.
 * If "reduce_percent" is provided, reduce cache size by
 * (ncsize_onepercent * reduce_percent).
 */
/*ARGSUSED*/
static void
do_dnlc_reduce_cache(void *reduce_percent)
{
	nc_hash_t *hp = dnlc_free_rotor, *start_hp = hp;
	vnode_t *vp;
	ncache_t *ncp;
	int cnt;
	uint_t low_water = dnlc_nentries_low_water;

	if (reduce_percent) {
		uint_t reduce_cnt;

		/*
		 * Never try to reduce the current number
		 * of cache entries below 3% of ncsize.
		 */
		if (dnlc_nentries <= ncsize_min_percent) {
			dnlc_reduce_idle = 1;
			return;
		}
		reduce_cnt = ncsize_onepercent *
		    (uint_t)(uintptr_t)reduce_percent;

		if (reduce_cnt > dnlc_nentries ||
		    dnlc_nentries - reduce_cnt < ncsize_min_percent)
			low_water = ncsize_min_percent;
		else
			low_water = dnlc_nentries - reduce_cnt;
	}

	do {
		/*
		 * Find the first non-empty hash queue without locking.
		 * Only look at each hash queue once to avoid an infinite loop.
		 */
		do {
			if (++hp == &nc_hash[nc_hashsz])
				hp = nc_hash;
		} while (hp->hash_next == (ncache_t *)hp && hp != start_hp);

		/* return if all hash queues are empty. */
		if (hp->hash_next == (ncache_t *)hp) {
			dnlc_reduce_idle = 1;
			return;
		}

		mutex_enter(&hp->hash_lock);
		for (cnt = 0, ncp = hp->hash_prev; ncp != (ncache_t *)hp;
		    ncp = ncp->hash_prev, cnt++) {
			vp = ncp->vp;
			/*
			 * A name cache entry with a reference count
			 * of one is only referenced by the dnlc.
			 * Also negative cache entries are purged first.
			 */
			if (!vn_has_cached_data(vp) &&
			    ((vp->v_count == 1) || (vp == DNLC_NO_VNODE))) {
				ncs.ncs_pick_heur.value.ui64++;
				goto found;
			}
			/*
			 * Remove from the end of the chain if the
			 * chain is too long.
			 */
			if (cnt > dnlc_long_chain) {
				ncp = hp->hash_prev;
				ncs.ncs_pick_last.value.ui64++;
				vp = ncp->vp;
				goto found;
			}
		}
		/* check for race and continue */
		if (hp->hash_next == (ncache_t *)hp) {
			mutex_exit(&hp->hash_lock);
			continue;
		}

		ncp = hp->hash_prev; /* pick the last one in the hash queue */
		ncs.ncs_pick_last.value.ui64++;
		vp = ncp->vp;
found:
		/*
		 * Remove from hash chain.
		 */
		nc_rmhash(ncp);
		mutex_exit(&hp->hash_lock);
		VN_RELE_DNLC(vp);
		VN_RELE_DNLC(ncp->dp);
		dnlc_free(ncp);
	} while (dnlc_nentries > low_water);

	dnlc_free_rotor = hp;
	dnlc_reduce_idle = 1;
}

/*
 * Directory caching routines
 * ==========================
 *
 * See dnlc.h for details of the interfaces below.
 */

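/*
 * Rough usage sketch of the interfaces below (editorial and illustrative
 * only; dnlc.h is the authoritative reference). A filesystem embeds a
 * dcanchor_t per directory and drives the cache roughly as follows:
 *
 *	uint64_t handle;
 *
 *	switch (dnlc_dir_lookup(dcap, name, &handle)) {
 *	case DFOUND:	... use handle ...
 *	case DNOENT:	... cache is complete; name doesn't exist ...
 *	case DNOCACHE:
 *		if (dnlc_dir_start(dcap, est_num_entries) == DOK) {
 *			... scan the directory, calling
 *			    dnlc_dir_add_entry() and dnlc_dir_add_space()
 *			    for each entry and free slot, then
 *			    dnlc_dir_complete() ...
 *		}
 *	}
 */
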
/*
 * Look up an entry in a complete or partial directory cache.
 */
dcret_t
dnlc_dir_lookup(dcanchor_t *dcap, const char *name, uint64_t *handle)
{
	dircache_t *dcp;
	dcentry_t *dep;
	int hash;
	int ret;
	uchar_t namlen;

	/*
	 * can test without lock as we are only a cache
	 */
	if (!VALID_DIR_CACHE(dcap->dca_dircache)) {
		ncs.ncs_dir_misses.value.ui64++;
		return (DNOCACHE);
	}

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = ddi_get_lbolt64();
		DNLC_DIR_HASH(name, hash, namlen);
		dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
		while (dep != NULL) {
			if ((dep->de_hash == hash) &&
			    (namlen == dep->de_namelen) &&
			    bcmp(dep->de_name, name, namlen) == 0) {
				*handle = dep->de_handle;
				mutex_exit(&dcap->dca_lock);
				ncs.ncs_dir_hits.value.ui64++;
				return (DFOUND);
			}
			dep = dep->de_next;
		}
		if (dcp->dc_complete) {
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		ncs.ncs_dir_misses.value.ui64++;
		return (DNOCACHE);
	}
}

/*
 * Start a new directory cache. An estimate of the number of
 * entries is provided as a quick check to ensure the directory
 * is cacheable.
 */
dcret_t
dnlc_dir_start(dcanchor_t *dcap, uint_t num_entries)
{
	dircache_t *dcp;

	if (!dnlc_dir_enable ||
	    (num_entries < dnlc_dir_min_size)) {
		return (DNOCACHE);
	}

	if (num_entries > dnlc_dir_max_size) {
		return (DTOOBIG);
	}

	mutex_enter(&dc_head.dch_lock);
	mutex_enter(&dcap->dca_lock);

	if (dcap->dca_dircache == DC_RET_LOW_MEM) {
		dcap->dca_dircache = NULL;
		mutex_exit(&dcap->dca_lock);
		mutex_exit(&dc_head.dch_lock);
		return (DNOMEM);
	}

	/*
	 * Check if there's currently a cache.
	 * This probably only occurs on a race.
	 */
	if (dcap->dca_dircache != NULL) {
		mutex_exit(&dcap->dca_lock);
		mutex_exit(&dc_head.dch_lock);
		return (DNOCACHE);
	}

	/*
	 * Allocate the dircache struct, entry and free space hash tables.
	 * These tables are initially just one entry but dynamically resize
	 * when entries and free space are added or removed.
	 */
	if ((dcp = kmem_zalloc(sizeof (dircache_t), KM_NOSLEEP)) == NULL) {
		goto error;
	}
	if ((dcp->dc_namehash = kmem_zalloc(sizeof (dcentry_t *),
	    KM_NOSLEEP)) == NULL) {
		goto error;
	}
	if ((dcp->dc_freehash = kmem_zalloc(sizeof (dcfree_t *),
	    KM_NOSLEEP)) == NULL) {
		goto error;
	}

	dcp->dc_anchor = dcap; /* set back pointer to anchor */
	dcap->dca_dircache = dcp;

	/* add into head of global chain */
	dcp->dc_next = dc_head.dch_next;
	dcp->dc_prev = (dircache_t *)&dc_head;
	dcp->dc_next->dc_prev = dcp;
	dc_head.dch_next = dcp;

	mutex_exit(&dcap->dca_lock);
	mutex_exit(&dc_head.dch_lock);
	ncs.ncs_cur_dirs.value.ui64++;
	ncs.ncs_dirs_cached.value.ui64++;
	return (DOK);
error:
	if (dcp != NULL) {
		if (dcp->dc_namehash) {
			kmem_free(dcp->dc_namehash, sizeof (dcentry_t *));
		}
		kmem_free(dcp, sizeof (dircache_t));
	}
	/*
	 * Must also kmem_free dcp->dc_freehash if more error cases are added
	 */
	mutex_exit(&dcap->dca_lock);
	mutex_exit(&dc_head.dch_lock);
	ncs.ncs_dir_start_nm.value.ui64++;
	return (DNOCACHE);
}

/*
 * Add a directory entry to a partial or complete directory cache.
 */
dcret_t
dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle)
{
	dircache_t *dcp;
	dcentry_t **hp, *dep;
	int hash;
	uint_t capacity;
	uchar_t namlen;

	/*
	 * Allocate the dcentry struct, including the variable
	 * size name. Note, the null terminator is not copied.
	 *
	 * We do this outside the lock to avoid possible deadlock if
	 * dnlc_dir_reclaim() is called as a result of memory shortage.
	 */
	DNLC_DIR_HASH(name, hash, namlen);
	dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
	if (dep == NULL) {
#ifdef DEBUG
		/*
		 * The kmem allocator generates random failures for
		 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE),
		 * so try again before we blow away a perfectly good cache.
		 * This is done not to cover an error but purely for
		 * performance when running a debug kernel.
		 * This random error only occurs in debug mode.
		 */
		dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
		if (dep != NULL)
			goto ok;
#endif
		ncs.ncs_dir_add_nm.value.ui64++;
		/*
		 * Free a directory cache. This may be the one we are
		 * called with.
		 */
		dnlc_dir_reclaim(NULL);
		dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
		if (dep == NULL) {
			/*
			 * Still no memory; better delete this cache.
			 */
			mutex_enter(&dcap->dca_lock);
			dcp = (dircache_t *)dcap->dca_dircache;
			if (VALID_DIR_CACHE(dcp)) {
				dnlc_dir_abort(dcp);
				dcap->dca_dircache = DC_RET_LOW_MEM;
			}
			mutex_exit(&dcap->dca_lock);
			ncs.ncs_dir_addabort.value.ui64++;
			return (DNOCACHE);
		}
		/*
		 * Fall through as if the 1st kmem_alloc had worked.
		 */
	}
#ifdef DEBUG
ok:
#endif
	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		/*
		 * If the total number of entries goes above the max
		 * then free this cache.
		 */
		if ((dcp->dc_num_entries + dcp->dc_num_free) >
		    dnlc_dir_max_size) {
			mutex_exit(&dcap->dca_lock);
			dnlc_dir_purge(dcap);
			kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
			ncs.ncs_dir_add_max.value.ui64++;
			return (DTOOBIG);
		}
		dcp->dc_num_entries++;
		capacity = (dcp->dc_nhash_mask + 1) << dnlc_dir_hash_size_shift;
		if (dcp->dc_num_entries >=
		    (capacity << dnlc_dir_hash_resize_shift)) {
			dnlc_dir_adjust_nhash(dcp);
		}
		hp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];

		/*
		 * Initialise and chain in new entry.
		 */
		dep->de_handle = handle;
		dep->de_hash = hash;
		/*
		 * Note de_namelen is a uchar_t to conserve space
		 * and alignment padding. The max length of any
		 * pathname component is defined as MAXNAMELEN,
		 * which is 256 (including the terminating null).
		 * So provided this doesn't change, we don't include the null,
		 * we always use bcmp to compare strings, and we don't
		 * start storing full names, then we are ok.
		 * The space savings are worth it.
		 */
		dep->de_namelen = namlen;
		bcopy(name, dep->de_name, namlen);
		dep->de_next = *hp;
		*hp = dep;
		dcp->dc_actime = ddi_get_lbolt64();
		mutex_exit(&dcap->dca_lock);
		ncs.ncs_dir_num_ents.value.ui64++;
		return (DOK);
	} else {
		mutex_exit(&dcap->dca_lock);
		kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
		return (DNOCACHE);
	}
}

/*
 * Add free space to a partial or complete directory cache.
 */
dcret_t
dnlc_dir_add_space(dcanchor_t *dcap, uint_t len, uint64_t handle)
{
	dircache_t *dcp;
	dcfree_t *dfp, **hp;
	uint_t capacity;

	/*
	 * We kmem_alloc outside the lock to avoid possible deadlock if
	 * dnlc_dir_reclaim() is called as a result of memory shortage.
	 */
	dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
	if (dfp == NULL) {
#ifdef DEBUG
		/*
		 * The kmem allocator generates random failures for
		 * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE),
		 * so try again before we blow away a perfectly good cache.
		 * This random error only occurs in debug mode.
		 */
		dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
		if (dfp != NULL)
			goto ok;
#endif
		ncs.ncs_dir_add_nm.value.ui64++;
		/*
		 * Free a directory cache. This may be the one we are
		 * called with.
		 */
		dnlc_dir_reclaim(NULL);
		dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
		if (dfp == NULL) {
			/*
			 * Still no memory; better delete this cache.
			 */
			mutex_enter(&dcap->dca_lock);
			dcp = (dircache_t *)dcap->dca_dircache;
			if (VALID_DIR_CACHE(dcp)) {
				dnlc_dir_abort(dcp);
				dcap->dca_dircache = DC_RET_LOW_MEM;
			}
			mutex_exit(&dcap->dca_lock);
			ncs.ncs_dir_addabort.value.ui64++;
			return (DNOCACHE);
		}
		/*
		 * Fall through as if the 1st kmem_alloc had worked.
		 */
	}
#ifdef DEBUG
ok:
#endif
	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		if ((dcp->dc_num_entries + dcp->dc_num_free) >
		    dnlc_dir_max_size) {
			mutex_exit(&dcap->dca_lock);
			dnlc_dir_purge(dcap);
			kmem_cache_free(dnlc_dir_space_cache, dfp);
			ncs.ncs_dir_add_max.value.ui64++;
			return (DTOOBIG);
		}
		dcp->dc_num_free++;
		capacity = (dcp->dc_fhash_mask + 1) << dnlc_dir_hash_size_shift;
		if (dcp->dc_num_free >=
		    (capacity << dnlc_dir_hash_resize_shift)) {
			dnlc_dir_adjust_fhash(dcp);
		}
		/*
		 * Initialise and chain a new entry.
		 */
		dfp->df_handle = handle;
		dfp->df_len = len;
		dcp->dc_actime = ddi_get_lbolt64();
		hp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
		dfp->df_next = *hp;
		*hp = dfp;
		mutex_exit(&dcap->dca_lock);
		ncs.ncs_dir_num_ents.value.ui64++;
		return (DOK);
	} else {
		mutex_exit(&dcap->dca_lock);
		kmem_cache_free(dnlc_dir_space_cache, dfp);
		return (DNOCACHE);
	}
}

/*
 * Mark a directory cache as complete.
 */
void
dnlc_dir_complete(dcanchor_t *dcap)
{
	dircache_t *dcp;

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_complete = B_TRUE;
	}
	mutex_exit(&dcap->dca_lock);
}

/*
 * Internal routine to delete a partial or full directory cache.
 * No additional locking needed.
 */
static void
dnlc_dir_abort(dircache_t *dcp)
{
	dcentry_t *dep, *nhp;
	dcfree_t *fep, *fhp;
	uint_t nhtsize = dcp->dc_nhash_mask + 1; /* name hash table size */
	uint_t fhtsize = dcp->dc_fhash_mask + 1; /* free hash table size */
	uint_t i;

	/*
	 * Free up the cached name entries and hash table.
	 */
	for (i = 0; i < nhtsize; i++) { /* for each hash bucket */
		nhp = dcp->dc_namehash[i];
		while (nhp != NULL) { /* for each chained entry */
			dep = nhp->de_next;
			kmem_free(nhp, sizeof (dcentry_t) - 1 +
			    nhp->de_namelen);
			nhp = dep;
		}
	}
	kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * nhtsize);

	/*
	 * Free up the free space entries and hash table.
	 */
	for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
		fhp = dcp->dc_freehash[i];
		while (fhp != NULL) { /* for each chained entry */
			fep = fhp->df_next;
			kmem_cache_free(dnlc_dir_space_cache, fhp);
			fhp = fep;
		}
	}
	kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * fhtsize);

	/*
	 * Finally free the directory cache structure itself.
	 */
	ncs.ncs_dir_num_ents.value.ui64 -= (dcp->dc_num_entries +
	    dcp->dc_num_free);
	kmem_free(dcp, sizeof (dircache_t));
	ncs.ncs_cur_dirs.value.ui64--;
}

/*
 * Remove a partial or complete directory cache.
 */
void
dnlc_dir_purge(dcanchor_t *dcap)
{
	dircache_t *dcp;

	mutex_enter(&dc_head.dch_lock);
	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (!VALID_DIR_CACHE(dcp)) {
		mutex_exit(&dcap->dca_lock);
		mutex_exit(&dc_head.dch_lock);
		return;
	}
	dcap->dca_dircache = NULL;
	/*
	 * Unchain from global list.
	 */
	dcp->dc_prev->dc_next = dcp->dc_next;
	dcp->dc_next->dc_prev = dcp->dc_prev;
	mutex_exit(&dcap->dca_lock);
	mutex_exit(&dc_head.dch_lock);
	dnlc_dir_abort(dcp);
}

/*
 * Remove an entry from a complete or partial directory cache.
 * If handlep is non-null, return the entry's handle through it.
 */
dcret_t
dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name, uint64_t *handlep)
{
	dircache_t *dcp;
	dcentry_t **prevpp, *te;
	uint_t capacity;
	int hash;
	int ret;
	uchar_t namlen;

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = ddi_get_lbolt64();
		if (dcp->dc_nhash_mask > 0) { /* ie not minimum */
			capacity = (dcp->dc_nhash_mask + 1) <<
			    dnlc_dir_hash_size_shift;
			if (dcp->dc_num_entries <=
			    (capacity >> dnlc_dir_hash_resize_shift)) {
				dnlc_dir_adjust_nhash(dcp);
			}
		}
		DNLC_DIR_HASH(name, hash, namlen);
		prevpp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];
		while (*prevpp != NULL) {
			if (((*prevpp)->de_hash == hash) &&
			    (namlen == (*prevpp)->de_namelen) &&
			    bcmp((*prevpp)->de_name, name, namlen) == 0) {
				if (handlep != NULL) {
					*handlep = (*prevpp)->de_handle;
				}
				te = *prevpp;
				*prevpp = (*prevpp)->de_next;
				kmem_free(te, sizeof (dcentry_t) - 1 +
				    te->de_namelen);

				/*
				 * If the total number of entries
				 * falls below half the minimum number
				 * of entries then free this cache.
				 */
				if (--dcp->dc_num_entries <
				    (dnlc_dir_min_size >> 1)) {
					mutex_exit(&dcap->dca_lock);
					dnlc_dir_purge(dcap);
				} else {
					mutex_exit(&dcap->dca_lock);
				}
				ncs.ncs_dir_num_ents.value.ui64--;
				return (DFOUND);
			}
			prevpp = &((*prevpp)->de_next);
		}
		if (dcp->dc_complete) {
			ncs.ncs_dir_reme_fai.value.ui64++;
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		return (DNOCACHE);
	}
}

/*
 * Remove free space of at least the given length from a complete
 * or partial directory cache.
 */
dcret_t
dnlc_dir_rem_space_by_len(dcanchor_t *dcap, uint_t len, uint64_t *handlep)
{
	dircache_t *dcp;
	dcfree_t **prevpp, *tfp;
	uint_t fhtsize; /* free hash table size */
	uint_t i;
	uint_t capacity;
	int ret;

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = ddi_get_lbolt64();
		if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
			capacity = (dcp->dc_fhash_mask + 1) <<
			    dnlc_dir_hash_size_shift;
			if (dcp->dc_num_free <=
			    (capacity >> dnlc_dir_hash_resize_shift)) {
				dnlc_dir_adjust_fhash(dcp);
			}
		}
		/*
		 * Search for an entry of the appropriate size
		 * on a first-fit basis.
		 */
		fhtsize = dcp->dc_fhash_mask + 1;
		for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
			prevpp = &(dcp->dc_freehash[i]);
			while (*prevpp != NULL) {
				if ((*prevpp)->df_len >= len) {
					*handlep = (*prevpp)->df_handle;
					tfp = *prevpp;
					*prevpp = (*prevpp)->df_next;
					dcp->dc_num_free--;
					mutex_exit(&dcap->dca_lock);
					kmem_cache_free(dnlc_dir_space_cache,
					    tfp);
					ncs.ncs_dir_num_ents.value.ui64--;
					return (DFOUND);
				}
				prevpp = &((*prevpp)->df_next);
			}
		}
		if (dcp->dc_complete) {
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		return (DNOCACHE);
	}
}

/*
 * Remove free space with the given handle from a complete or partial
 * directory cache.
 */
dcret_t
dnlc_dir_rem_space_by_handle(dcanchor_t *dcap, uint64_t handle)
{
	dircache_t *dcp;
	dcfree_t **prevpp, *tfp;
	uint_t capacity;
	int ret;

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = ddi_get_lbolt64();
		if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
			capacity = (dcp->dc_fhash_mask + 1) <<
			    dnlc_dir_hash_size_shift;
			if (dcp->dc_num_free <=
			    (capacity >> dnlc_dir_hash_resize_shift)) {
				dnlc_dir_adjust_fhash(dcp);
			}
		}

		/*
		 * Search for the exact entry.
		 */
		prevpp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
		while (*prevpp != NULL) {
			if ((*prevpp)->df_handle == handle) {
				tfp = *prevpp;
				*prevpp = (*prevpp)->df_next;
				dcp->dc_num_free--;
				mutex_exit(&dcap->dca_lock);
				kmem_cache_free(dnlc_dir_space_cache, tfp);
				ncs.ncs_dir_num_ents.value.ui64--;
				return (DFOUND);
			}
			prevpp = &((*prevpp)->df_next);
		}
		if (dcp->dc_complete) {
			ncs.ncs_dir_rems_fai.value.ui64++;
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		return (DNOCACHE);
	}
}

/*
 * Update the handle of a directory cache entry.
 */
dcret_t
dnlc_dir_update(dcanchor_t *dcap, const char *name, uint64_t handle)
{
	dircache_t *dcp;
	dcentry_t *dep;
	int hash;
	int ret;
	uchar_t namlen;

	if (!dnlc_dir_enable) {
		return (DNOCACHE);
	}

	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		dcp->dc_actime = ddi_get_lbolt64();
		DNLC_DIR_HASH(name, hash, namlen);
		dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
		while (dep != NULL) {
			if ((dep->de_hash == hash) &&
			    (namlen == dep->de_namelen) &&
			    bcmp(dep->de_name, name, namlen) == 0) {
				dep->de_handle = handle;
				mutex_exit(&dcap->dca_lock);
				return (DFOUND);
			}
			dep = dep->de_next;
		}
		if (dcp->dc_complete) {
			ncs.ncs_dir_upd_fail.value.ui64++;
			ret = DNOENT;
		} else {
			ret = DNOCACHE;
		}
		mutex_exit(&dcap->dca_lock);
		return (ret);
	} else {
		mutex_exit(&dcap->dca_lock);
		return (DNOCACHE);
	}
}

void
dnlc_dir_fini(dcanchor_t *dcap)
{
	dircache_t *dcp;

	mutex_enter(&dc_head.dch_lock);
	mutex_enter(&dcap->dca_lock);
	dcp = (dircache_t *)dcap->dca_dircache;
	if (VALID_DIR_CACHE(dcp)) {
		/*
		 * Unchain from global list.
		 */
		ncs.ncs_dir_finipurg.value.ui64++;
		dcp->dc_prev->dc_next = dcp->dc_next;
		dcp->dc_next->dc_prev = dcp->dc_prev;
	} else {
		dcp = NULL;
	}
	dcap->dca_dircache = NULL;
	mutex_exit(&dcap->dca_lock);
	mutex_exit(&dc_head.dch_lock);
	mutex_destroy(&dcap->dca_lock);
	if (dcp) {
		dnlc_dir_abort(dcp);
	}
}

/*
 * Reclaim callback for dnlc directory caching.
 * Invoked by the kernel memory allocator when memory gets tight.
 * This is a pretty serious condition and can easily lead to system
 * hangs if not enough space is returned.
 *
 * Deciding which directory (or directories) to purge is tricky.
 * Purging everything is overkill, but purging just the oldest used
 * was found to lead to hangs. The largest cached directories use the
 * most memory, but take the most effort to rebuild, whereas the smaller
 * ones have little value and give back little space. So what to do?
 *
 * The current policy is to continue purging the oldest used directories
 * until at least dnlc_dir_min_reclaim directory entries have been purged.
 */
/*ARGSUSED*/
static void
dnlc_dir_reclaim(void *unused)
{
	dircache_t *dcp, *oldest;
	uint_t dirent_cnt = 0;

	mutex_enter(&dc_head.dch_lock);
	while (dirent_cnt < dnlc_dir_min_reclaim) {
		dcp = dc_head.dch_next;
		oldest = NULL;
		while (dcp != (dircache_t *)&dc_head) {
			if (oldest == NULL) {
				oldest = dcp;
			} else {
				if (dcp->dc_actime < oldest->dc_actime) {
					oldest = dcp;
				}
			}
			dcp = dcp->dc_next;
		}
		if (oldest == NULL) {
			/* nothing to delete */
			mutex_exit(&dc_head.dch_lock);
			return;
		}
		/*
		 * Remove from directory chain and purge.
		 */
		oldest->dc_prev->dc_next = oldest->dc_next;
		oldest->dc_next->dc_prev = oldest->dc_prev;
		mutex_enter(&oldest->dc_anchor->dca_lock);
		/*
		 * If this was the last entry then it must be too large.
		 * Mark it as such by saving a special dircache_t
		 * pointer (DC_RET_LOW_MEM) in the anchor. The error DNOMEM
		 * will be presented to the caller of dnlc_dir_start().
		 */
		if (oldest->dc_next == oldest->dc_prev) {
			oldest->dc_anchor->dca_dircache = DC_RET_LOW_MEM;
			ncs.ncs_dir_rec_last.value.ui64++;
		} else {
			oldest->dc_anchor->dca_dircache = NULL;
			ncs.ncs_dir_recl_any.value.ui64++;
		}
		mutex_exit(&oldest->dc_anchor->dca_lock);
		dirent_cnt += oldest->dc_num_entries;
		dnlc_dir_abort(oldest);
	}
	mutex_exit(&dc_head.dch_lock);
}

/*
 * Dynamically grow or shrink the size of the name hash table
 */
static void
dnlc_dir_adjust_nhash(dircache_t *dcp)
{
	dcentry_t **newhash, *dep, **nhp, *tep;
	uint_t newsize;
	uint_t oldsize;
	uint_t newsizemask;
	int i;

	/*
	 * Allocate new hash table.
	 */
	newsize = dcp->dc_num_entries >> dnlc_dir_hash_size_shift;
	newhash = kmem_zalloc(sizeof (dcentry_t *) * newsize, KM_NOSLEEP);
	if (newhash == NULL) {
		/*
		 * System is short on memory; just return.
		 * Note, the old hash table is still usable.
		 * This return is unlikely to repeatedly occur, because
		 * either some other directory caches will be reclaimed
		 * due to memory shortage, thus freeing memory, or this
		 * directory cache will be reclaimed.
		 */
		return;
	}
	oldsize = dcp->dc_nhash_mask + 1;
	dcp->dc_nhash_mask = newsizemask = newsize - 1;

	/*
	 * Move entries from the old table to the new.
	 */
	for (i = 0; i < oldsize; i++) { /* for each hash bucket */
		dep = dcp->dc_namehash[i];
		while (dep != NULL) { /* for each chained entry */
			tep = dep;
			dep = dep->de_next;
			nhp = &newhash[tep->de_hash & newsizemask];
			tep->de_next = *nhp;
			*nhp = tep;
		}
	}

	/*
	 * Delete old hash table and set new one in place.
	 */
	kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * oldsize);
	dcp->dc_namehash = newhash;
}

/*
 * Dynamically grow or shrink the size of the free space hash table
 */
static void
dnlc_dir_adjust_fhash(dircache_t *dcp)
{
	dcfree_t **newhash, *dfp, **nhp, *tfp;
	uint_t newsize;
	uint_t oldsize;
	int i;

	/*
	 * Allocate new hash table.
	 */
	newsize = dcp->dc_num_free >> dnlc_dir_hash_size_shift;
	newhash = kmem_zalloc(sizeof (dcfree_t *) * newsize, KM_NOSLEEP);
	if (newhash == NULL) {
		/*
		 * System is short on memory; just return.
		 * Note, the old hash table is still usable.
		 * This return is unlikely to repeatedly occur, because
		 * either some other directory caches will be reclaimed
		 * due to memory shortage, thus freeing memory, or this
		 * directory cache will be reclaimed.
		 */
		return;
	}
	oldsize = dcp->dc_fhash_mask + 1;
	dcp->dc_fhash_mask = newsize - 1;

	/*
	 * Move entries from the old table to the new.
	 */
	for (i = 0; i < oldsize; i++) { /* for each hash bucket */
		dfp = dcp->dc_freehash[i];
		while (dfp != NULL) { /* for each chained entry */
			tfp = dfp;
			dfp = dfp->df_next;
			nhp = &newhash[DDFHASH(tfp->df_handle, dcp)];
			tfp->df_next = *nhp;
			*nhp = tfp;
		}
	}

	/*
	 * Delete old hash table and set new one in place.
	 */
	kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * oldsize);
	dcp->dc_freehash = newhash;
}