/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015, Joyent, Inc.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/dnlc.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/bitmap.h>
#include <sys/var.h>
#include <sys/sysmacros.h>
#include <sys/kstat.h>
#include <sys/atomic.h>
#include <sys/taskq.h>
/*
 * Directory name lookup cache.
 * Based on code originally done by Robert Elz at Melbourne.
 *
 * Names found by directory scans are retained in a cache
 * for future reference. Each hash chain is ordered by LRU.
 * The cache is indexed by a hash value obtained from (vp, name),
 * where vp refers to the directory containing the name.
 *
 * We want to be able to identify files that are referenced only by the DNLC.
 * When adding a reference from the DNLC, call VN_HOLD_DNLC instead of
 * VN_HOLD, since multiple DNLC references should only be counted once in
 * v_count. The VN_HOLD macro itself is aliased to VN_HOLD_CALLER in this
 * file to help differentiate the behaviors. (Unfortunately it is not
 * possible to #undef VN_HOLD and retain VN_HOLD_CALLER. Ideally a Makefile
 * rule would grep uncommented C tokens to check that VN_HOLD is referenced
 * only once in this file, to define VN_HOLD_CALLER.)
 */
#define VN_HOLD_CALLER VN_HOLD
#define VN_HOLD_DNLC(vp) { \
    mutex_enter(&(vp)->v_lock); \
    if ((vp)->v_count_dnlc == 0) { \
        VN_HOLD_LOCKED(vp); \
    } \
    (vp)->v_count_dnlc++; \
    mutex_exit(&(vp)->v_lock); \
}
#define VN_RELE_DNLC(vp) { \
    vn_rele_dnlc(vp); \
}
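
/*
 * Illustrative only: a minimal sketch of how a filesystem lookup path
 * typically consumes this cache. The function name and error handling
 * below are invented for the example; only dnlc_lookup(), dnlc_enter(),
 * DNLC_NO_VNODE and VN_RELE() are real interfaces.
 */
#if 0	/* example only, never compiled */
static int
example_fs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp)
{
    vnode_t *vp;

    if ((vp = dnlc_lookup(dvp, nm)) != NULL) {
        if (vp == DNLC_NO_VNODE) {	/* cached negative entry */
            VN_RELE(vp);
            return (ENOENT);
        }
        *vpp = vp;	/* dnlc_lookup() returned it held */
        return (0);
    }
    /* miss: read the directory, then prime the cache for next time */
    dnlc_enter(dvp, nm, *vpp);	/* or DNLC_NO_VNODE on ENOENT */
    return (0);
}
#endif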
/*
 * Tunable nc_hashavelen is the average length desired for this chain, from
 * which the size of the nc_hash table is derived at create time.
 */
#define NC_HASHAVELEN_DEFAULT 4
int nc_hashavelen = NC_HASHAVELEN_DEFAULT;
/*
 * NC_MOVETOFRONT is the move-to-front threshold: if the hash lookup
 * depth exceeds this value, we move the looked-up entry to the front of
 * its hash chain. The idea is to make sure that the most frequently
 * accessed entries are found most quickly (by keeping them near the
 * front of their hash chains).
 */
#define NC_MOVETOFRONT 2
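
/*
 * For example, with NC_MOVETOFRONT at 2, a lookup that finds its entry
 * at chain depth 3 or deeper (depth starts at 1 in dnlc_lookup()) will
 * relink that entry to the head of its hash chain.
 */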
/*
 * DNLC_MAX_RELE is used to size an array on the stack when releasing
 * vnodes. This array is used rather than calling VN_RELE() inline because
 * all dnlc locks must be dropped by that time in order to avoid a
 * possible deadlock. This deadlock occurs when the dnlc holds the last
 * reference to the vnode and so the VOP_INACTIVE vector is called, which
 * can in turn call back into the dnlc. A global array was used but had
 * many problems:
 * 1) It didn't actually have an upper bound on the array size, as
 *    entries can be added after starting the purge.
 * 2) The locking scheme caused a hang.
 * 3) It caused serialisation on the global lock.
 * 4) The array was often unnecessarily huge.
 *
 * Note that the current value of 8 allows up to 4 cache entries (to be
 * purged from each hash chain) before having to cycle around and retry.
 * This ought to be ample given that nc_hashavelen is typically very small.
 */
#define DNLC_MAX_RELE 8 /* must be even */
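
/*
 * Each purged entry consumes two slots in the release array (its vp and
 * its dp), which is why DNLC_MAX_RELE must be even and why 8 slots cover
 * only 4 entries per pass; the purge loops below re-examine a hash chain
 * whenever the array fills before the chain is exhausted.
 */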
/*
 * Hash table of name cache entries for fast lookup, dynamically
 * allocated at startup.
 */
nc_hash_t *nc_hash;
/*
 * Rotors. Used to select entries on a round-robin basis.
 */
static nc_hash_t *dnlc_purge_fs1_rotor;
static nc_hash_t *dnlc_free_rotor;
/*
 * # of dnlc entries (uninitialized)
 *
 * the initial value was chosen as being
 * a random string of bits, probably not
 * normally chosen by a systems administrator
 */
int ncsize = -1;
volatile uint32_t dnlc_nentries = 0; /* current num of name cache entries */
static int nc_hashsz; /* size of hash table */
static int nc_hashmask; /* size of hash table minus 1 */
/*
 * The dnlc_reduce_cache() taskq queue is activated when there are
 * ncsize name cache entries, and if no parameter is provided it reduces
 * the size down to dnlc_nentries_low_water, which is by default one
 * hundredth less than (i.e. 99% of) ncsize.
 *
 * If a parameter is provided to dnlc_reduce_cache(), then we reduce
 * the size down based on ncsize_onepercent - where ncsize_onepercent
 * is 1% of ncsize; however, we never let dnlc_reduce_cache() reduce
 * the size below 3% of ncsize (ncsize_min_percent).
 */
#define DNLC_LOW_WATER_DIVISOR_DEFAULT 100
uint_t dnlc_low_water_divisor = DNLC_LOW_WATER_DIVISOR_DEFAULT;
uint_t dnlc_nentries_low_water;
int dnlc_reduce_idle = 1; /* no locking needed */
uint_t ncsize_onepercent;
uint_t ncsize_min_percent;
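
/*
 * Worked example: with ncsize at 10000 and the default divisor of 100,
 * dnlc_nentries_low_water is 9900, ncsize_onepercent is 100 and
 * ncsize_min_percent is 300.
 */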
/*
 * If dnlc_nentries hits dnlc_max_nentries (twice ncsize)
 * then this means the dnlc_reduce_cache() taskq is failing to
 * keep up. In this case we refuse to add new entries to the dnlc
 * until the taskq catches up.
 */
uint_t dnlc_max_nentries; /* twice ncsize */
uint64_t dnlc_max_nentries_cnt = 0; /* statistic on times we failed */
/*
 * Tunable to define when we should just remove items from
 * the end of the chain.
 */
#define DNLC_LONG_CHAIN 8
uint_t dnlc_long_chain = DNLC_LONG_CHAIN;
/*
 * ncstats has been deprecated, due to the integer size of the counters
 * which can easily overflow in the dnlc.
 * It is maintained (at some expense) for compatibility.
 * The preferred interface is the kstat accessible nc_stats below.
 */
struct ncstats ncstats;
struct nc_stats ncs = {
    { "hits", KSTAT_DATA_UINT64 },
    { "misses", KSTAT_DATA_UINT64 },
    { "negative_cache_hits", KSTAT_DATA_UINT64 },
    { "enters", KSTAT_DATA_UINT64 },
    { "double_enters", KSTAT_DATA_UINT64 },
    { "purge_total_entries", KSTAT_DATA_UINT64 },
    { "purge_all", KSTAT_DATA_UINT64 },
    { "purge_vp", KSTAT_DATA_UINT64 },
    { "purge_vfs", KSTAT_DATA_UINT64 },
    { "purge_fs1", KSTAT_DATA_UINT64 },
    { "pick_free", KSTAT_DATA_UINT64 },
    { "pick_heuristic", KSTAT_DATA_UINT64 },
    { "pick_last", KSTAT_DATA_UINT64 },

    /* directory caching stats */

    { "dir_hits", KSTAT_DATA_UINT64 },
    { "dir_misses", KSTAT_DATA_UINT64 },
    { "dir_cached_current", KSTAT_DATA_UINT64 },
    { "dir_entries_cached_current", KSTAT_DATA_UINT64 },
    { "dir_cached_total", KSTAT_DATA_UINT64 },
    { "dir_start_no_memory", KSTAT_DATA_UINT64 },
    { "dir_add_no_memory", KSTAT_DATA_UINT64 },
    { "dir_add_abort", KSTAT_DATA_UINT64 },
    { "dir_add_max", KSTAT_DATA_UINT64 },
    { "dir_remove_entry_fail", KSTAT_DATA_UINT64 },
    { "dir_remove_space_fail", KSTAT_DATA_UINT64 },
    { "dir_update_fail", KSTAT_DATA_UINT64 },
    { "dir_fini_purge", KSTAT_DATA_UINT64 },
    { "dir_reclaim_last", KSTAT_DATA_UINT64 },
    { "dir_reclaim_any", KSTAT_DATA_UINT64 },
};
static int doingcache = 1;

vnode_t negative_cache_vnode;
/*
 * Insert entry at the front of the queue
 */
#define nc_inshash(ncp, hp) \
{ \
    (ncp)->hash_next = (hp)->hash_next; \
    (ncp)->hash_prev = (ncache_t *)(hp); \
    (hp)->hash_next->hash_prev = (ncp); \
    (hp)->hash_next = (ncp); \
}
/*
 * Remove entry from hash queue
 */
#define nc_rmhash(ncp) \
{ \
    (ncp)->hash_prev->hash_next = (ncp)->hash_next; \
    (ncp)->hash_next->hash_prev = (ncp)->hash_prev; \
    (ncp)->hash_prev = NULL; \
    (ncp)->hash_next = NULL; \
}
/*
 * Free an entry.
 */
#define dnlc_free(ncp) \
{ \
    kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \
    atomic_dec_32(&dnlc_nentries); \
}
/*
 * Cached directory info.
 * ======================
 */

/*
 * Cached directory free space hash function.
 * Needs the free space handle and the dcp to get the hash table size.
 * Returns the hash index.
 */
#define DDFHASH(handle, dcp) ((handle >> 2) & (dcp)->dc_fhash_mask)
/*
 * Cached directory name entry hash function.
 * Uses the name and returns in the input arguments the hash and the name
 * length.
 */
#define DNLC_DIR_HASH(name, hash, namelen) \
{ \
    char Xc; \
    const char *Xcp; \
    hash = *name; \
    for (Xcp = (name + 1); (Xc = *Xcp) != 0; Xcp++) \
        hash = (hash << 4) + hash + Xc; \
    ASSERT((Xcp - (name)) <= ((1 << NBBY) - 1)); \
    namelen = Xcp - (name); \
}
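
/*
 * This is the classic "times 17" string hash: each step computes
 * hash = hash * 17 + c. For the two-character name "ab", for example,
 * the result is 17 * 'a' + 'b' = 17 * 97 + 98 = 1747.
 */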
/* special dircache_t pointer to indicate error should be returned */
/*
 * The anchor directory cache pointer can contain 3 types of values,
 * 1) NULL: No directory cache
 * 2) DC_RET_LOW_MEM: There was a directory cache that was found to be
 *    too big or a memory shortage occurred. This value remains in the
 *    pointer until a dnlc_dir_start(), which returns a DNOMEM error.
 *    This is kludgy but efficient and only visible in this source file.
 * 3) A valid cache pointer.
 */
#define DC_RET_LOW_MEM (dircache_t *)1
#define VALID_DIR_CACHE(dcp) ((dircache_t *)(dcp) > DC_RET_LOW_MEM)
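
/*
 * The three anchor states order as 0 (NULL) < 1 (DC_RET_LOW_MEM) < any
 * real kernel heap address, which is what lets VALID_DIR_CACHE() test
 * for "a usable cache" with a single pointer comparison.
 */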
/* Tunables */
uint_t dnlc_dir_enable = 1; /* disable caching directories by setting to 0 */
uint_t dnlc_dir_min_size = 40; /* min no of directory entries before caching */
uint_t dnlc_dir_max_size = UINT_MAX; /* ditto maximum */
uint_t dnlc_dir_hash_size_shift = 3; /* 8 entries per hash bucket */
uint_t dnlc_dir_min_reclaim = 350000; /* approx 1MB of dcentrys */
/*
 * dnlc_dir_hash_resize_shift determines when the hash tables
 * get re-adjusted due to growth or shrinkage
 * - currently 2, indicating that there can be at most 4
 * times or at least one quarter the number of entries
 * before hash table readjustment. Note that with
 * dnlc_dir_hash_size_shift above set at 3 this would
 * mean readjustment would occur if the average number
 * of entries went above 32 or below 2
 */
uint_t dnlc_dir_hash_resize_shift = 2; /* readjust rate */
static kmem_cache_t *dnlc_dir_space_cache; /* free space entry cache */
static dchead_t dc_head; /* anchor of cached directories */
/* Prototypes */
static ncache_t *dnlc_get(uchar_t namlen);
static ncache_t *dnlc_search(vnode_t *dp, const char *name, uchar_t namlen,
    int hash);
static void dnlc_dir_reclaim(void *unused);
static void dnlc_dir_abort(dircache_t *dcp);
static void dnlc_dir_adjust_fhash(dircache_t *dcp);
static void dnlc_dir_adjust_nhash(dircache_t *dcp);
static void do_dnlc_reduce_cache(void *);
/*
 * Initialize the directory cache.
 */
void
dnlc_init()
{
    nc_hash_t *hp;
    kstat_t *ksp;
    int i;

    /*
     * Set up the size of the dnlc (ncsize) and its low water mark.
     */
    if (ncsize == -1) {
        /* calculate a reasonable size for the low water */
        dnlc_nentries_low_water = 4 * (v.v_proc + maxusers) + 320;
        ncsize = dnlc_nentries_low_water +
            (dnlc_nentries_low_water / dnlc_low_water_divisor);
    } else {
        /* don't change the user specified ncsize */
        dnlc_nentries_low_water =
            ncsize - (ncsize / dnlc_low_water_divisor);
    }
    if (ncsize <= 0) {
        doingcache = 0;
        dnlc_dir_enable = 0; /* also disable directory caching */
        ncsize = 0;
        cmn_err(CE_NOTE, "name cache (dnlc) disabled");
        return;
    }
    dnlc_max_nentries = ncsize * 2;
    ncsize_onepercent = ncsize / 100;
    ncsize_min_percent = ncsize_onepercent * 3;

    /*
     * Initialise the hash table.
     * Compute hash size rounding to the next power of two.
     */
    nc_hashsz = ncsize / nc_hashavelen;
    nc_hashsz = 1 << highbit(nc_hashsz);
    nc_hashmask = nc_hashsz - 1;
    nc_hash = kmem_zalloc(nc_hashsz * sizeof (*nc_hash), KM_SLEEP);
    for (i = 0; i < nc_hashsz; i++) {
        hp = (nc_hash_t *)&nc_hash[i];
        mutex_init(&hp->hash_lock, NULL, MUTEX_DEFAULT, NULL);
        hp->hash_next = (ncache_t *)hp;
        hp->hash_prev = (ncache_t *)hp;
    }

    /*
     * Initialize rotors
     */
    dnlc_free_rotor = dnlc_purge_fs1_rotor = &nc_hash[0];

    /*
     * Set up the directory caching to use kmem_cache_alloc
     * for its free space entries so that we can get a callback
     * when the system is short on memory, to allow us to free
     * up some memory. We don't use the constructor/destructor
     * functions.
     */
    dnlc_dir_space_cache = kmem_cache_create("dnlc_space_cache",
        sizeof (dcfree_t), 0, NULL, NULL, dnlc_dir_reclaim, NULL,
        NULL, 0);

    /*
     * Initialise the head of the cached directory structures
     */
    mutex_init(&dc_head.dch_lock, NULL, MUTEX_DEFAULT, NULL);
    dc_head.dch_next = (dircache_t *)&dc_head;
    dc_head.dch_prev = (dircache_t *)&dc_head;

    /*
     * Put a hold on the negative cache vnode so that it never goes away
     * (VOP_INACTIVE isn't called on it).
     */
    vn_reinit(&negative_cache_vnode);

    /*
     * Initialise kstats - both the old compatibility raw kind and
     * the more extensive named stats.
     */
    ksp = kstat_create("unix", 0, "ncstats", "misc", KSTAT_TYPE_RAW,
        sizeof (struct ncstats), KSTAT_FLAG_VIRTUAL);
    if (ksp) {
        ksp->ks_data = (void *) &ncstats;
        kstat_install(ksp);
    }
    ksp = kstat_create("unix", 0, "dnlcstats", "misc", KSTAT_TYPE_NAMED,
        sizeof (ncs) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
    if (ksp) {
        ksp->ks_data = (void *) &ncs;
        kstat_install(ksp);
    }
}
/*
 * Add a name to the directory cache.
 */
void
dnlc_enter(vnode_t *dp, const char *name, vnode_t *vp)
{
    ncache_t *ncp;
    nc_hash_t *hp;
    uchar_t namlen;
    int hash;

    TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_enter_start:");

    if (!doingcache) {
        TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
            "dnlc_enter_end:(%S) %d", "not caching", 0);
        return;
    }

    /*
     * Get a new dnlc entry. Assume the entry won't be in the cache
     * and initialize it now
     */
    DNLCHASH(name, dp, hash, namlen);
    if ((ncp = dnlc_get(namlen)) == NULL)
        return;
    ncp->dp = dp;
    VN_HOLD_DNLC(dp);
    ncp->vp = vp;
    VN_HOLD_DNLC(vp);
    bcopy(name, ncp->name, namlen + 1); /* name and null */
    ncp->hash = hash;
    hp = &nc_hash[hash & nc_hashmask];

    mutex_enter(&hp->hash_lock);
    if (dnlc_search(dp, name, namlen, hash) != NULL) {
        mutex_exit(&hp->hash_lock);
        ncstats.dbl_enters++;
        ncs.ncs_dbl_enters.value.ui64++;
        VN_RELE_DNLC(dp);
        VN_RELE_DNLC(vp);
        dnlc_free(ncp); /* crfree done here */
        TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
            "dnlc_enter_end:(%S) %d", "dbl enter", ncstats.dbl_enters);
        return;
    }

    /*
     * Insert back into the hash chain.
     */
    nc_inshash(ncp, hp);
    mutex_exit(&hp->hash_lock);
    ncstats.enters++;
    ncs.ncs_enters.value.ui64++;
    TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
        "dnlc_enter_end:(%S) %d", "done", ncstats.enters);
}
/*
 * Add a name to the directory cache.
 *
 * This function is basically identical to
 * dnlc_enter(). The difference is that when the
 * desired dnlc entry is found, the vnode in the
 * ncache is compared with the vnode passed in.
 *
 * If they are not equal then the ncache is
 * updated with the passed in vnode. Otherwise
 * it just frees up the newly allocated dnlc entry.
 */
void
dnlc_update(vnode_t *dp, const char *name, vnode_t *vp)
{
    ncache_t *ncp;
    ncache_t *tcp;
    vnode_t *tvp;
    nc_hash_t *hp;
    int hash;
    uchar_t namlen;

    TRACE_0(TR_FAC_NFS, TR_DNLC_ENTER_START, "dnlc_update_start:");

    if (!doingcache) {
        TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
            "dnlc_update_end:(%S) %d", "not caching", 0);
        return;
    }

    /*
     * Get a new dnlc entry and initialize it now.
     * If we fail to get a new entry, call dnlc_remove() to purge
     * any existing dnlc entry including negative cache (DNLC_NO_VNODE)
     * entry.
     * Failure to clear an existing entry could result in false dnlc
     * lookup (negative/stale entry).
     */
    DNLCHASH(name, dp, hash, namlen);
    if ((ncp = dnlc_get(namlen)) == NULL) {
        dnlc_remove(dp, name);
        return;
    }
    ncp->dp = dp;
    VN_HOLD_DNLC(dp);
    ncp->vp = vp;
    VN_HOLD_DNLC(vp);
    bcopy(name, ncp->name, namlen + 1); /* name and null */
    ncp->hash = hash;
    hp = &nc_hash[hash & nc_hashmask];

    mutex_enter(&hp->hash_lock);
    if ((tcp = dnlc_search(dp, name, namlen, hash)) != NULL) {
        if (tcp->vp != vp) {
            tvp = tcp->vp;
            tcp->vp = vp;
            mutex_exit(&hp->hash_lock);
            VN_RELE_DNLC(tvp);
            ncstats.enters++;
            ncs.ncs_enters.value.ui64++;
            TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
                "dnlc_update_end:(%S) %d", "done", ncstats.enters);
        } else {
            mutex_exit(&hp->hash_lock);
            VN_RELE_DNLC(vp);
            ncstats.dbl_enters++;
            ncs.ncs_dbl_enters.value.ui64++;
            TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
                "dnlc_update_end:(%S) %d",
                "dbl enter", ncstats.dbl_enters);
        }
        VN_RELE_DNLC(dp);
        dnlc_free(ncp); /* crfree done here */
        return;
    }

    /*
     * insert the new entry, since it is not in dnlc yet
     */
    nc_inshash(ncp, hp);
    mutex_exit(&hp->hash_lock);
    ncstats.enters++;
    ncs.ncs_enters.value.ui64++;
    TRACE_2(TR_FAC_NFS, TR_DNLC_ENTER_END,
        "dnlc_update_end:(%S) %d", "done", ncstats.enters);
}
/*
 * Look up a name in the directory name cache.
 *
 * Return a doubly-held vnode if found: one hold so that it may
 * remain in the cache for other users, the other hold so that
 * the cache is not re-cycled and the identity of the vnode is
 * lost before the caller can use the vnode.
 */
vnode_t *
dnlc_lookup(vnode_t *dp, const char *name)
{
    ncache_t *ncp;
    nc_hash_t *hp;
    vnode_t *vp;
    int hash, depth;
    uchar_t namlen;

    TRACE_2(TR_FAC_NFS, TR_DNLC_LOOKUP_START,
        "dnlc_lookup_start:dp %x name %s", dp, name);

    if (!doingcache) {
        TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
            "dnlc_lookup_end:%S %d vp %x name %s",
            "not_caching", 0, NULL, name);
        return (NULL);
    }

    DNLCHASH(name, dp, hash, namlen);
    depth = 1;
    hp = &nc_hash[hash & nc_hashmask];
    mutex_enter(&hp->hash_lock);

    for (ncp = hp->hash_next; ncp != (ncache_t *)hp;
        ncp = ncp->hash_next) {
        if (ncp->hash == hash && /* fast signature check */
            ncp->dp == dp &&
            ncp->namlen == namlen &&
            bcmp(ncp->name, name, namlen) == 0) {
            /*
             * Move this entry to the head of its hash chain
             * if it's not already close.
             */
            if (depth > NC_MOVETOFRONT) {
                ncache_t *next = ncp->hash_next;
                ncache_t *prev = ncp->hash_prev;

                prev->hash_next = next;
                next->hash_prev = prev;
                ncp->hash_next = next = hp->hash_next;
                ncp->hash_prev = (ncache_t *)hp;
                next->hash_prev = ncp;
                hp->hash_next = ncp;

                ncstats.move_to_front++;
            }

            /*
             * Put a hold on the vnode now so its identity
             * can't change before the caller has a chance to
             * put a hold on it.
             */
            vp = ncp->vp;
            VN_HOLD_CALLER(vp);
            mutex_exit(&hp->hash_lock);
            ncstats.hits++;
            ncs.ncs_hits.value.ui64++;
            if (vp == DNLC_NO_VNODE) {
                ncs.ncs_neg_hits.value.ui64++;
            }
            TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
                "dnlc_lookup_end:%S %d vp %x name %s", "hit",
                ncstats.hits, vp, name);
            return (vp);
        }
        depth++;
    }

    mutex_exit(&hp->hash_lock);
    ncstats.misses++;
    ncs.ncs_misses.value.ui64++;
    TRACE_4(TR_FAC_NFS, TR_DNLC_LOOKUP_END,
        "dnlc_lookup_end:%S %d vp %x name %s", "miss", ncstats.misses,
        NULL, name);
    return (NULL);
}
/*
 * Remove an entry in the directory name cache.
 */
void
dnlc_remove(vnode_t *dp, const char *name)
{
    ncache_t *ncp;
    nc_hash_t *hp;
    uchar_t namlen;
    int hash;

    if (!doingcache)
        return;
    DNLCHASH(name, dp, hash, namlen);
    hp = &nc_hash[hash & nc_hashmask];

    mutex_enter(&hp->hash_lock);
    if (ncp = dnlc_search(dp, name, namlen, hash)) {
        /*
         * Free up the entry
         */
        nc_rmhash(ncp);
        mutex_exit(&hp->hash_lock);
        VN_RELE_DNLC(ncp->vp);
        VN_RELE_DNLC(ncp->dp);
        dnlc_free(ncp);
        return;
    }
    mutex_exit(&hp->hash_lock);
}
/*
 * Purge the entire cache.
 */
void
dnlc_purge()
{
    nc_hash_t *nch;
    ncache_t *ncp;
    int index;
    int i;
    vnode_t *nc_rele[DNLC_MAX_RELE];

    if (!doingcache)
        return;

    ncstats.purges++;
    ncs.ncs_purge_all.value.ui64++;

    for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
        index = 0;
        mutex_enter(&nch->hash_lock);
        ncp = nch->hash_next;
        while (ncp != (ncache_t *)nch) {
            ncache_t *np;

            np = ncp->hash_next;
            nc_rele[index++] = ncp->vp;
            nc_rele[index++] = ncp->dp;

            nc_rmhash(ncp);
            dnlc_free(ncp);
            ncp = np;
            ncs.ncs_purge_total.value.ui64++;
            if (index == DNLC_MAX_RELE)
                break;
        }
        mutex_exit(&nch->hash_lock);

        /* Release holds on all the vnodes now that we have no locks */
        for (i = 0; i < index; i++) {
            VN_RELE_DNLC(nc_rele[i]);
        }
        if (ncp != (ncache_t *)nch) {
            nch--; /* Do current hash chain again */
        }
    }
}
/*
 * Purge any cache entries referencing a vnode. Exit as soon as the dnlc
 * reference count goes to zero (the caller still holds a reference).
 */
void
dnlc_purge_vp(vnode_t *vp)
{
    nc_hash_t *nch;
    ncache_t *ncp;
    int index;
    vnode_t *nc_rele[DNLC_MAX_RELE];

    ASSERT(vp->v_count > 0);
    if (vp->v_count_dnlc == 0) {
        return;
    }

    if (!doingcache)
        return;

    ncstats.purges++;
    ncs.ncs_purge_vp.value.ui64++;

    for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
        index = 0;
        mutex_enter(&nch->hash_lock);
        ncp = nch->hash_next;
        while (ncp != (ncache_t *)nch) {
            ncache_t *np;

            np = ncp->hash_next;
            if (ncp->dp == vp || ncp->vp == vp) {
                nc_rele[index++] = ncp->vp;
                nc_rele[index++] = ncp->dp;
                nc_rmhash(ncp);
                dnlc_free(ncp);
                ncs.ncs_purge_total.value.ui64++;
                if (index == DNLC_MAX_RELE) {
                    ncp = np;
                    break;
                }
            }
            ncp = np;
        }
        mutex_exit(&nch->hash_lock);

        /* Release holds on all the vnodes now that we have no locks */
        while (index) {
            VN_RELE_DNLC(nc_rele[--index]);
        }

        if (vp->v_count_dnlc == 0) {
            return;
        }

        if (ncp != (ncache_t *)nch) {
            nch--; /* Do current hash chain again */
        }
    }
}
/*
 * Purge cache entries referencing a vfsp. Caller supplies a count
 * of entries to purge; up to that many will be freed. A count of
 * zero indicates that all such entries should be purged. Returns
 * the number of entries that were purged.
 */
int
dnlc_purge_vfsp(vfs_t *vfsp, int count)
{
    nc_hash_t *nch;
    ncache_t *ncp;
    int n = 0;
    int index;
    int i;
    vnode_t *nc_rele[DNLC_MAX_RELE];

    if (!doingcache)
        return (0);

    ncstats.purges++;
    ncs.ncs_purge_vfs.value.ui64++;

    for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
        index = 0;
        mutex_enter(&nch->hash_lock);
        ncp = nch->hash_next;
        while (ncp != (ncache_t *)nch) {
            ncache_t *np;

            np = ncp->hash_next;
            ASSERT(ncp->dp != NULL);
            ASSERT(ncp->vp != NULL);
            if ((ncp->dp->v_vfsp == vfsp) ||
                (ncp->vp->v_vfsp == vfsp)) {
                n++;
                nc_rele[index++] = ncp->vp;
                nc_rele[index++] = ncp->dp;
                nc_rmhash(ncp);
                dnlc_free(ncp);
                ncs.ncs_purge_total.value.ui64++;
                if (index == DNLC_MAX_RELE) {
                    ncp = np;
                    break;
                }
                if (count != 0 && n >= count) {
                    break;
                }
            }
            ncp = np;
        }
        mutex_exit(&nch->hash_lock);
        /* Release holds on all the vnodes now that we have no locks */
        for (i = 0; i < index; i++) {
            VN_RELE_DNLC(nc_rele[i]);
        }
        if (count != 0 && n >= count) {
            return (n);
        }
        if (ncp != (ncache_t *)nch) {
            nch--; /* Do current hash chain again */
        }
    }
    return (n);
}
/*
 * Purge 1 entry from the dnlc that is part of the filesystem(s)
 * represented by 'vop'. The purpose of this routine is to allow
 * users of the dnlc to free a vnode that is being held by the dnlc.
 *
 * If we find a vnode that we release which will result in
 * freeing the underlying vnode (count was 1), return 1; return 0
 * if no appropriate vnodes are found.
 *
 * Note, vop is not the 'right' identifier for a filesystem.
 */
int
dnlc_fs_purge1(vnodeops_t *vop)
{
    nc_hash_t *end;
    nc_hash_t *hp;
    ncache_t *ncp;
    vnode_t *vp;

    if (!doingcache)
        return (0);

    ncs.ncs_purge_fs1.value.ui64++;

    /*
     * Scan the dnlc entries looking for a likely candidate.
     */
    hp = end = dnlc_purge_fs1_rotor;

    do {
        if (++hp == &nc_hash[nc_hashsz])
            hp = nc_hash;
        dnlc_purge_fs1_rotor = hp;
        if (hp->hash_next == (ncache_t *)hp)
            continue;
        mutex_enter(&hp->hash_lock);
        for (ncp = hp->hash_prev;
            ncp != (ncache_t *)hp;
            ncp = ncp->hash_prev) {
            vp = ncp->vp;
            if (!vn_has_cached_data(vp) && (vp->v_count == 1) &&
                vn_matchops(vp, vop))
                break;
        }
        if (ncp != (ncache_t *)hp) {
            nc_rmhash(ncp);
            mutex_exit(&hp->hash_lock);
            VN_RELE_DNLC(ncp->dp);
            VN_RELE_DNLC(vp)
            dnlc_free(ncp);
            ncs.ncs_purge_total.value.ui64++;
            return (1);
        }
        mutex_exit(&hp->hash_lock);
    } while (hp != end);
    return (0);
}
/*
 * Utility routine to search for a cache entry. Return the
 * ncache entry if found, NULL otherwise.
 */
static ncache_t *
dnlc_search(vnode_t *dp, const char *name, uchar_t namlen, int hash)
{
    nc_hash_t *hp;
    ncache_t *ncp;

    hp = &nc_hash[hash & nc_hashmask];

    for (ncp = hp->hash_next; ncp != (ncache_t *)hp; ncp = ncp->hash_next) {
        if (ncp->hash == hash &&
            ncp->dp == dp &&
            ncp->namlen == namlen &&
            bcmp(ncp->name, name, namlen) == 0)
            return (ncp);
    }
    return (NULL);
}
#if ((1 << NBBY) - 1) < (MAXNAMELEN - 1)
#error ncache_t name length representation is too small
#endif

void
dnlc_reduce_cache(void *reduce_percent)
{
    if (dnlc_reduce_idle && (dnlc_nentries >= ncsize || reduce_percent)) {
        dnlc_reduce_idle = 0;
        if ((taskq_dispatch(system_taskq, do_dnlc_reduce_cache,
            reduce_percent, TQ_NOSLEEP)) == NULL)
            dnlc_reduce_idle = 1;
    }
}
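
/*
 * A hedged usage sketch: the reduce percentage is smuggled through the
 * taskq argument as an integer value cast to a pointer, so a
 * memory-pressure callback (the ZFS ARC reap path does something of
 * this shape) would request roughly a 3% trim as shown below.
 */
#if 0	/* example only, never compiled */
    dnlc_reduce_cache((void *)(uintptr_t)3);	/* trim ~3% of ncsize */
    dnlc_reduce_cache(NULL);			/* trim to low water */
#endif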
/*
 * Get a new name cache entry.
 * If the dnlc_reduce_cache() taskq isn't keeping up with demand, or memory
 * is short, then just return NULL. If we're over ncsize then kick off a
 * thread to free some in use entries down to dnlc_nentries_low_water.
 * Caller must initialise all fields except namlen.
 * Component names are defined to be less than MAXNAMELEN,
 * which includes a null.
 */
static ncache_t *
dnlc_get(uchar_t namlen)
{
    ncache_t *ncp;

    if (dnlc_nentries > dnlc_max_nentries) {
        dnlc_max_nentries_cnt++; /* keep a statistic */
        return (NULL);
    }
    ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP);
    if (ncp == NULL) {
        return (NULL);
    }
    ncp->namlen = namlen;
    atomic_inc_32(&dnlc_nentries);
    dnlc_reduce_cache(NULL);
    return (ncp);
}
/*
 * Taskq routine to free up name cache entries to reduce the
 * cache size to the low water mark if "reduce_percent" is not provided.
 * If "reduce_percent" is provided, reduce cache size by
 * (ncsize_onepercent * reduce_percent).
 */
/*ARGSUSED*/
static void
do_dnlc_reduce_cache(void *reduce_percent)
{
    nc_hash_t *hp = dnlc_free_rotor, *start_hp = hp;
    vnode_t *vp;
    ncache_t *ncp;
    int cnt;
    uint_t low_water = dnlc_nentries_low_water;

    if (reduce_percent) {
        uint_t reduce_cnt;

        /*
         * Never try to reduce the current number
         * of cache entries below 3% of ncsize.
         */
        if (dnlc_nentries <= ncsize_min_percent) {
            dnlc_reduce_idle = 1;
            return;
        }
        reduce_cnt = ncsize_onepercent *
            (uint_t)(uintptr_t)reduce_percent;

        if (reduce_cnt > dnlc_nentries ||
            dnlc_nentries - reduce_cnt < ncsize_min_percent)
            low_water = ncsize_min_percent;
        else
            low_water = dnlc_nentries - reduce_cnt;
    }

    do {
        /*
         * Find the first non empty hash queue without locking.
         * Only look at each hash queue once to avoid an infinite loop.
         */
        do {
            if (++hp == &nc_hash[nc_hashsz])
                hp = nc_hash;
        } while (hp->hash_next == (ncache_t *)hp && hp != start_hp);

        /* return if all hash queues are empty. */
        if (hp->hash_next == (ncache_t *)hp) {
            dnlc_reduce_idle = 1;
            return;
        }

        mutex_enter(&hp->hash_lock);
        for (cnt = 0, ncp = hp->hash_prev; ncp != (ncache_t *)hp;
            ncp = ncp->hash_prev, cnt++) {
            vp = ncp->vp;
            /*
             * A name cache entry with a reference count
             * of one is only referenced by the dnlc.
             * Also negative cache entries are purged first.
             */
            if (!vn_has_cached_data(vp) &&
                ((vp->v_count == 1) || (vp == DNLC_NO_VNODE))) {
                ncs.ncs_pick_heur.value.ui64++;
                goto found;
            }
            /*
             * Remove from the end of the chain if the
             * chain is too long
             */
            if (cnt > dnlc_long_chain) {
                ncp = hp->hash_prev;
                ncs.ncs_pick_last.value.ui64++;
                vp = ncp->vp;
                goto found;
            }
        }
        /* check for race and continue */
        if (hp->hash_next == (ncache_t *)hp) {
            mutex_exit(&hp->hash_lock);
            continue;
        }

        ncp = hp->hash_prev; /* pick the last one in the hash queue */
        ncs.ncs_pick_last.value.ui64++;
        vp = ncp->vp;
found:
        /*
         * Remove from hash chain.
         */
        nc_rmhash(ncp);
        mutex_exit(&hp->hash_lock);
        VN_RELE_DNLC(vp);
        VN_RELE_DNLC(ncp->dp);
        dnlc_free(ncp);
    } while (dnlc_nentries > low_water);

    dnlc_free_rotor = hp;
    dnlc_reduce_idle = 1;
}
/*
 * Directory caching routines
 * ==========================
 *
 * See dnlc.h for details of the interfaces below.
 */
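
/*
 * Illustrative only: the expected fill protocol for a per-directory
 * cache, with the dcanchor_t assumed to be embedded in the filesystem's
 * directory state (the surrounding names and variables are invented for
 * this sketch; the dnlc_dir_* calls and return codes are real).
 */
#if 0	/* example only, never compiled */
    if (dnlc_dir_start(dcap, est_nentries) == DOK) {
        /* walk the directory, recording entries and free space... */
        (void) dnlc_dir_add_entry(dcap, nm, handle);
        (void) dnlc_dir_add_space(dcap, freelen, handle);
        /* ...then mark it complete so misses can return DNOENT */
        dnlc_dir_complete(dcap);
    }
#endif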
/*
 * Look up an entry in a complete or partial directory cache.
 */
dcret_t
dnlc_dir_lookup(dcanchor_t *dcap, const char *name, uint64_t *handle)
{
    dircache_t *dcp;
    dcentry_t *dep;
    int hash;
    int ret;
    uchar_t namlen;

    /*
     * can test without lock as we are only a cache
     */
    if (!VALID_DIR_CACHE(dcap->dca_dircache)) {
        ncs.ncs_dir_misses.value.ui64++;
        return (DNOCACHE);
    }

    if (!dnlc_dir_enable) {
        return (DNOCACHE);
    }

    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (VALID_DIR_CACHE(dcp)) {
        dcp->dc_actime = ddi_get_lbolt64();
        DNLC_DIR_HASH(name, hash, namlen);
        dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
        while (dep != NULL) {
            if ((dep->de_hash == hash) &&
                (namlen == dep->de_namelen) &&
                bcmp(dep->de_name, name, namlen) == 0) {
                *handle = dep->de_handle;
                mutex_exit(&dcap->dca_lock);
                ncs.ncs_dir_hits.value.ui64++;
                return (DFOUND);
            }
            dep = dep->de_next;
        }
        if (dcp->dc_complete) {
            ret = DNOENT;
        } else {
            ret = DNOCACHE;
        }
        mutex_exit(&dcap->dca_lock);
        return (ret);
    } else {
        mutex_exit(&dcap->dca_lock);
        ncs.ncs_dir_misses.value.ui64++;
        return (DNOCACHE);
    }
}
/*
 * Start a new directory cache. An estimate of the number of
 * entries is provided as a quick check to ensure the directory
 * is cacheable.
 */
dcret_t
dnlc_dir_start(dcanchor_t *dcap, uint_t num_entries)
{
    dircache_t *dcp;

    if (!dnlc_dir_enable ||
        (num_entries < dnlc_dir_min_size)) {
        return (DNOCACHE);
    }

    if (num_entries > dnlc_dir_max_size) {
        return (DTOOBIG);
    }

    mutex_enter(&dc_head.dch_lock);
    mutex_enter(&dcap->dca_lock);

    if (dcap->dca_dircache == DC_RET_LOW_MEM) {
        dcap->dca_dircache = NULL;
        mutex_exit(&dcap->dca_lock);
        mutex_exit(&dc_head.dch_lock);
        return (DNOMEM);
    }

    /*
     * Check if there's currently a cache.
     * This probably only occurs on a race.
     */
    if (dcap->dca_dircache != NULL) {
        mutex_exit(&dcap->dca_lock);
        mutex_exit(&dc_head.dch_lock);
        return (DNOCACHE);
    }

    /*
     * Allocate the dircache struct, entry and free space hash tables.
     * These tables are initially just one entry but dynamically resize
     * when entries and free space are added or removed.
     */
    if ((dcp = kmem_zalloc(sizeof (dircache_t), KM_NOSLEEP)) == NULL) {
        goto error;
    }
    if ((dcp->dc_namehash = kmem_zalloc(sizeof (dcentry_t *),
        KM_NOSLEEP)) == NULL) {
        goto error;
    }
    if ((dcp->dc_freehash = kmem_zalloc(sizeof (dcfree_t *),
        KM_NOSLEEP)) == NULL) {
        goto error;
    }

    dcp->dc_anchor = dcap; /* set back pointer to anchor */
    dcap->dca_dircache = dcp;

    /* add into head of global chain */
    dcp->dc_next = dc_head.dch_next;
    dcp->dc_prev = (dircache_t *)&dc_head;
    dcp->dc_next->dc_prev = dcp;
    dc_head.dch_next = dcp;

    mutex_exit(&dcap->dca_lock);
    mutex_exit(&dc_head.dch_lock);
    ncs.ncs_cur_dirs.value.ui64++;
    ncs.ncs_dirs_cached.value.ui64++;
    return (DOK);
error:
    if (dcp != NULL) {
        if (dcp->dc_namehash) {
            kmem_free(dcp->dc_namehash, sizeof (dcentry_t *));
        }
        kmem_free(dcp, sizeof (dircache_t));
    }
    /*
     * Must also kmem_free dcp->dc_freehash if more error cases are added
     */
    mutex_exit(&dcap->dca_lock);
    mutex_exit(&dc_head.dch_lock);
    ncs.ncs_dir_start_nm.value.ui64++;
    return (DNOCACHE);
}
/*
 * Add a directory entry to a partial or complete directory cache.
 */
dcret_t
dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle)
{
    dircache_t *dcp;
    dcentry_t **hp, *dep;
    int hash;
    uint_t capacity;
    uchar_t namlen;

    /*
     * Allocate the dcentry struct, including the variable
     * size name. Note, the null terminator is not copied.
     *
     * We do this outside the lock to avoid possible deadlock if
     * dnlc_dir_reclaim() is called as a result of memory shortage.
     */
    DNLC_DIR_HASH(name, hash, namlen);
    dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
    if (dep == NULL) {
#ifdef DEBUG
        /*
         * The kmem allocator generates random failures for
         * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE)
         * So try again before we blow away a perfectly good cache.
         * This is done not to cover an error but purely for
         * performance running a debug kernel.
         * This random error only occurs in debug mode.
         */
        dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
        if (dep != NULL)
            goto ok;
#endif
        ncs.ncs_dir_add_nm.value.ui64++;
        /*
         * Free a directory cache. This may be the one we are
         * called with.
         */
        dnlc_dir_reclaim(NULL);
        dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
        if (dep == NULL) {
            /*
             * still no memory, better delete this cache
             */
            mutex_enter(&dcap->dca_lock);
            dcp = (dircache_t *)dcap->dca_dircache;
            if (VALID_DIR_CACHE(dcp)) {
                dnlc_dir_abort(dcp);
                dcap->dca_dircache = DC_RET_LOW_MEM;
            }
            mutex_exit(&dcap->dca_lock);
            ncs.ncs_dir_addabort.value.ui64++;
            return (DNOCACHE);
        }
        /*
         * fall through as if the 1st kmem_alloc had worked
         */
    }
#ifdef DEBUG
ok:
#endif
    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (VALID_DIR_CACHE(dcp)) {
        /*
         * If the total number of entries goes above the max
         * then free this cache
         */
        if ((dcp->dc_num_entries + dcp->dc_num_free) >
            dnlc_dir_max_size) {
            mutex_exit(&dcap->dca_lock);
            dnlc_dir_purge(dcap);
            kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
            ncs.ncs_dir_add_max.value.ui64++;
            return (DTOOBIG);
        }
        dcp->dc_num_entries++;
        capacity = (dcp->dc_nhash_mask + 1) << dnlc_dir_hash_size_shift;
        if (dcp->dc_num_entries >=
            (capacity << dnlc_dir_hash_resize_shift)) {
            dnlc_dir_adjust_nhash(dcp);
        }
        hp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];

        /*
         * Initialise and chain in new entry
         */
        dep->de_handle = handle;
        dep->de_hash = hash;
        /*
         * Note de_namelen is a uchar_t to conserve space
         * and alignment padding. The max length of any
         * pathname component is defined as MAXNAMELEN
         * which is 256 (including the terminating null).
         * So provided this doesn't change, we don't include the null,
         * we always use bcmp to compare strings, and we don't
         * start storing full names, then we are ok.
         * The space savings is worth it.
         */
        dep->de_namelen = namlen;
        bcopy(name, dep->de_name, namlen);
        dep->de_next = *hp;
        *hp = dep;
        dcp->dc_actime = ddi_get_lbolt64();
        mutex_exit(&dcap->dca_lock);
        ncs.ncs_dir_num_ents.value.ui64++;
        return (DOK);
    } else {
        mutex_exit(&dcap->dca_lock);
        kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
        return (DNOCACHE);
    }
}
/*
 * Add free space to a partial or complete directory cache.
 */
dcret_t
dnlc_dir_add_space(dcanchor_t *dcap, uint_t len, uint64_t handle)
{
    dircache_t *dcp;
    dcfree_t *dfp, **hp;
    uint_t capacity;

    /*
     * We kmem_alloc outside the lock to avoid possible deadlock if
     * dnlc_dir_reclaim() is called as a result of memory shortage.
     */
    dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
    if (dfp == NULL) {
#ifdef DEBUG
        /*
         * The kmem allocator generates random failures for
         * KM_NOSLEEP calls (see KMEM_RANDOM_ALLOCATION_FAILURE)
         * So try again before we blow away a perfectly good cache.
         * This random error only occurs in debug mode
         */
        dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
        if (dfp != NULL)
            goto ok;
#endif
        ncs.ncs_dir_add_nm.value.ui64++;
        /*
         * Free a directory cache. This may be the one we are
         * called with.
         */
        dnlc_dir_reclaim(NULL);
        dfp = kmem_cache_alloc(dnlc_dir_space_cache, KM_NOSLEEP);
        if (dfp == NULL) {
            /*
             * still no memory, better delete this cache
             */
            mutex_enter(&dcap->dca_lock);
            dcp = (dircache_t *)dcap->dca_dircache;
            if (VALID_DIR_CACHE(dcp)) {
                dnlc_dir_abort(dcp);
                dcap->dca_dircache = DC_RET_LOW_MEM;
            }
            mutex_exit(&dcap->dca_lock);
            ncs.ncs_dir_addabort.value.ui64++;
            return (DNOCACHE);
        }
        /*
         * fall through as if the 1st kmem_alloc had worked
         */
    }

#ifdef DEBUG
ok:
#endif
    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (VALID_DIR_CACHE(dcp)) {
        if ((dcp->dc_num_entries + dcp->dc_num_free) >
            dnlc_dir_max_size) {
            mutex_exit(&dcap->dca_lock);
            dnlc_dir_purge(dcap);
            kmem_cache_free(dnlc_dir_space_cache, dfp);
            ncs.ncs_dir_add_max.value.ui64++;
            return (DTOOBIG);
        }
        dcp->dc_num_free++;
        capacity = (dcp->dc_fhash_mask + 1) << dnlc_dir_hash_size_shift;
        if (dcp->dc_num_free >=
            (capacity << dnlc_dir_hash_resize_shift)) {
            dnlc_dir_adjust_fhash(dcp);
        }
        /*
         * Initialise and chain a new entry
         */
        dfp->df_handle = handle;
        dfp->df_len = len;
        dcp->dc_actime = ddi_get_lbolt64();
        hp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
        dfp->df_next = *hp;
        *hp = dfp;
        mutex_exit(&dcap->dca_lock);
        ncs.ncs_dir_num_ents.value.ui64++;
        return (DOK);
    } else {
        mutex_exit(&dcap->dca_lock);
        kmem_cache_free(dnlc_dir_space_cache, dfp);
        return (DNOCACHE);
    }
}
/*
 * Mark a directory cache as complete.
 */
void
dnlc_dir_complete(dcanchor_t *dcap)
{
    dircache_t *dcp;

    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (VALID_DIR_CACHE(dcp)) {
        dcp->dc_complete = B_TRUE;
    }
    mutex_exit(&dcap->dca_lock);
}
/*
 * Internal routine to delete a partial or full directory cache.
 * No additional locking needed.
 */
static void
dnlc_dir_abort(dircache_t *dcp)
{
    dcentry_t *dep, *nhp;
    dcfree_t *fep, *fhp;
    uint_t nhtsize = dcp->dc_nhash_mask + 1; /* name hash table size */
    uint_t fhtsize = dcp->dc_fhash_mask + 1; /* free hash table size */
    uint_t i;

    /*
     * Free up the cached name entries and hash table
     */
    for (i = 0; i < nhtsize; i++) { /* for each hash bucket */
        nhp = dcp->dc_namehash[i];
        while (nhp != NULL) { /* for each chained entry */
            dep = nhp->de_next;
            kmem_free(nhp, sizeof (dcentry_t) - 1 +
                nhp->de_namelen);
            nhp = dep;
        }
    }
    kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * nhtsize);

    /*
     * Free up the free space entries and hash table
     */
    for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
        fhp = dcp->dc_freehash[i];
        while (fhp != NULL) { /* for each chained entry */
            fep = fhp->df_next;
            kmem_cache_free(dnlc_dir_space_cache, fhp);
            fhp = fep;
        }
    }
    kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * fhtsize);

    /*
     * Finally free the directory cache structure itself
     */
    ncs.ncs_dir_num_ents.value.ui64 -= (dcp->dc_num_entries +
        dcp->dc_num_free);
    kmem_free(dcp, sizeof (dircache_t));
    ncs.ncs_cur_dirs.value.ui64--;
}
/*
 * Remove a partial or complete directory cache
 */
void
dnlc_dir_purge(dcanchor_t *dcap)
{
    dircache_t *dcp;

    mutex_enter(&dc_head.dch_lock);
    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (!VALID_DIR_CACHE(dcp)) {
        mutex_exit(&dcap->dca_lock);
        mutex_exit(&dc_head.dch_lock);
        return;
    }
    dcap->dca_dircache = NULL;
    /*
     * Unchain from global list
     */
    dcp->dc_prev->dc_next = dcp->dc_next;
    dcp->dc_next->dc_prev = dcp->dc_prev;
    mutex_exit(&dcap->dca_lock);
    mutex_exit(&dc_head.dch_lock);
    dnlc_dir_abort(dcp);
}
/*
 * Remove an entry from a complete or partial directory cache.
 * Return the handle if it's non null.
 */
dcret_t
dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name, uint64_t *handlep)
{
    dircache_t *dcp;
    dcentry_t **prevpp, *te;
    uint_t capacity;
    int hash;
    int ret;
    uchar_t namlen;

    if (!dnlc_dir_enable) {
        return (DNOCACHE);
    }

    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (VALID_DIR_CACHE(dcp)) {
        dcp->dc_actime = ddi_get_lbolt64();
        if (dcp->dc_nhash_mask > 0) { /* ie not minimum */
            capacity = (dcp->dc_nhash_mask + 1) <<
                dnlc_dir_hash_size_shift;
            if (dcp->dc_num_entries <=
                (capacity >> dnlc_dir_hash_resize_shift)) {
                dnlc_dir_adjust_nhash(dcp);
            }
        }
        DNLC_DIR_HASH(name, hash, namlen);
        prevpp = &dcp->dc_namehash[hash & dcp->dc_nhash_mask];
        while (*prevpp != NULL) {
            if (((*prevpp)->de_hash == hash) &&
                (namlen == (*prevpp)->de_namelen) &&
                bcmp((*prevpp)->de_name, name, namlen) == 0) {
                if (handlep != NULL) {
                    *handlep = (*prevpp)->de_handle;
                }
                te = *prevpp;
                *prevpp = (*prevpp)->de_next;
                kmem_free(te, sizeof (dcentry_t) - 1 +
                    te->de_namelen);

                /*
                 * If the total number of entries
                 * falls below half the minimum number
                 * of entries then free this cache.
                 */
                if (--dcp->dc_num_entries <
                    (dnlc_dir_min_size >> 1)) {
                    mutex_exit(&dcap->dca_lock);
                    dnlc_dir_purge(dcap);
                } else {
                    mutex_exit(&dcap->dca_lock);
                }
                ncs.ncs_dir_num_ents.value.ui64--;
                return (DFOUND);
            }
            prevpp = &((*prevpp)->de_next);
        }
        if (dcp->dc_complete) {
            ncs.ncs_dir_reme_fai.value.ui64++;
            ret = DNOENT;
        } else {
            ret = DNOCACHE;
        }
        mutex_exit(&dcap->dca_lock);
        return (ret);
    } else {
        mutex_exit(&dcap->dca_lock);
        return (DNOCACHE);
    }
}
/*
 * Remove free space of at least the given length from a complete
 * or partial directory cache.
 */
dcret_t
dnlc_dir_rem_space_by_len(dcanchor_t *dcap, uint_t len, uint64_t *handlep)
{
    dircache_t *dcp;
    dcfree_t **prevpp, *tfp;
    uint_t fhtsize; /* free hash table size */
    uint_t i;
    uint_t capacity;
    int ret;

    if (!dnlc_dir_enable) {
        return (DNOCACHE);
    }

    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (VALID_DIR_CACHE(dcp)) {
        dcp->dc_actime = ddi_get_lbolt64();
        if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
            capacity = (dcp->dc_fhash_mask + 1) <<
                dnlc_dir_hash_size_shift;
            if (dcp->dc_num_free <=
                (capacity >> dnlc_dir_hash_resize_shift)) {
                dnlc_dir_adjust_fhash(dcp);
            }
        }

        /*
         * Search for an entry of the appropriate size
         * on a first fit basis.
         */
        fhtsize = dcp->dc_fhash_mask + 1;
        for (i = 0; i < fhtsize; i++) { /* for each hash bucket */
            prevpp = &(dcp->dc_freehash[i]);
            while (*prevpp != NULL) {
                if ((*prevpp)->df_len >= len) {
                    *handlep = (*prevpp)->df_handle;
                    tfp = *prevpp;
                    *prevpp = (*prevpp)->df_next;
                    dcp->dc_num_free--;
                    mutex_exit(&dcap->dca_lock);
                    kmem_cache_free(dnlc_dir_space_cache,
                        tfp);
                    ncs.ncs_dir_num_ents.value.ui64--;
                    return (DFOUND);
                }
                prevpp = &((*prevpp)->df_next);
            }
        }
        if (dcp->dc_complete) {
            ret = DNOENT;
        } else {
            ret = DNOCACHE;
        }
        mutex_exit(&dcap->dca_lock);
        return (ret);
    } else {
        mutex_exit(&dcap->dca_lock);
        return (DNOCACHE);
    }
}
/*
 * Remove free space with the given handle from a complete or partial
 * directory cache.
 */
dcret_t
dnlc_dir_rem_space_by_handle(dcanchor_t *dcap, uint64_t handle)
{
    dircache_t *dcp;
    dcfree_t **prevpp, *tfp;
    uint_t capacity;
    int ret;

    if (!dnlc_dir_enable) {
        return (DNOCACHE);
    }

    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (VALID_DIR_CACHE(dcp)) {
        dcp->dc_actime = ddi_get_lbolt64();
        if (dcp->dc_fhash_mask > 0) { /* ie not minimum */
            capacity = (dcp->dc_fhash_mask + 1) <<
                dnlc_dir_hash_size_shift;
            if (dcp->dc_num_free <=
                (capacity >> dnlc_dir_hash_resize_shift)) {
                dnlc_dir_adjust_fhash(dcp);
            }
        }

        /*
         * search for the exact entry
         */
        prevpp = &(dcp->dc_freehash[DDFHASH(handle, dcp)]);
        while (*prevpp != NULL) {
            if ((*prevpp)->df_handle == handle) {
                tfp = *prevpp;
                *prevpp = (*prevpp)->df_next;
                dcp->dc_num_free--;
                mutex_exit(&dcap->dca_lock);
                kmem_cache_free(dnlc_dir_space_cache, tfp);
                ncs.ncs_dir_num_ents.value.ui64--;
                return (DFOUND);
            }
            prevpp = &((*prevpp)->df_next);
        }
        if (dcp->dc_complete) {
            ncs.ncs_dir_rems_fai.value.ui64++;
            ret = DNOENT;
        } else {
            ret = DNOCACHE;
        }
        mutex_exit(&dcap->dca_lock);
        return (ret);
    } else {
        mutex_exit(&dcap->dca_lock);
        return (DNOCACHE);
    }
}
/*
 * Update the handle of a directory cache entry.
 */
dcret_t
dnlc_dir_update(dcanchor_t *dcap, const char *name, uint64_t handle)
{
    dircache_t *dcp;
    dcentry_t *dep;
    int hash;
    int ret;
    uchar_t namlen;

    if (!dnlc_dir_enable) {
        return (DNOCACHE);
    }

    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (VALID_DIR_CACHE(dcp)) {
        dcp->dc_actime = ddi_get_lbolt64();
        DNLC_DIR_HASH(name, hash, namlen);
        dep = dcp->dc_namehash[hash & dcp->dc_nhash_mask];
        while (dep != NULL) {
            if ((dep->de_hash == hash) &&
                (namlen == dep->de_namelen) &&
                bcmp(dep->de_name, name, namlen) == 0) {
                dep->de_handle = handle;
                mutex_exit(&dcap->dca_lock);
                return (DFOUND);
            }
            dep = dep->de_next;
        }
        if (dcp->dc_complete) {
            ncs.ncs_dir_upd_fail.value.ui64++;
            ret = DNOENT;
        } else {
            ret = DNOCACHE;
        }
        mutex_exit(&dcap->dca_lock);
        return (ret);
    } else {
        mutex_exit(&dcap->dca_lock);
        return (DNOCACHE);
    }
}
void
dnlc_dir_fini(dcanchor_t *dcap)
{
    dircache_t *dcp;

    mutex_enter(&dc_head.dch_lock);
    mutex_enter(&dcap->dca_lock);
    dcp = (dircache_t *)dcap->dca_dircache;
    if (VALID_DIR_CACHE(dcp)) {
        /*
         * Unchain from global list
         */
        ncs.ncs_dir_finipurg.value.ui64++;
        dcp->dc_prev->dc_next = dcp->dc_next;
        dcp->dc_next->dc_prev = dcp->dc_prev;
    } else {
        dcp = NULL;
    }
    dcap->dca_dircache = NULL;
    mutex_exit(&dcap->dca_lock);
    mutex_exit(&dc_head.dch_lock);
    mutex_destroy(&dcap->dca_lock);
    if (dcp) {
        dnlc_dir_abort(dcp);
    }
}
/*
 * Reclaim callback for dnlc directory caching.
 * Invoked by the kernel memory allocator when memory gets tight.
 * This is a pretty serious condition and can easily lead to system
 * hangs if not enough space is returned.
 *
 * Deciding which directory (or directories) to purge is tricky.
 * Purging everything is overkill, but purging just the oldest used
 * was found to lead to hangs. The largest cached directories use the
 * most memory, but take the most effort to rebuild, whereas the smaller
 * ones have little value and give back little space. So what to do?
 *
 * The current policy is to continue purging the oldest used directories
 * until at least dnlc_dir_min_reclaim directory entries have been purged.
 */
/*ARGSUSED*/
static void
dnlc_dir_reclaim(void *unused)
{
    dircache_t *dcp, *oldest;
    uint_t dirent_cnt = 0;

    mutex_enter(&dc_head.dch_lock);
    while (dirent_cnt < dnlc_dir_min_reclaim) {
        dcp = dc_head.dch_next;
        oldest = NULL;
        while (dcp != (dircache_t *)&dc_head) {
            if (oldest == NULL) {
                oldest = dcp;
            } else {
                if (dcp->dc_actime < oldest->dc_actime) {
                    oldest = dcp;
                }
            }
            dcp = dcp->dc_next;
        }
        if (oldest == NULL) {
            /* nothing to delete */
            mutex_exit(&dc_head.dch_lock);
            return;
        }
        /*
         * remove from directory chain and purge
         */
        oldest->dc_prev->dc_next = oldest->dc_next;
        oldest->dc_next->dc_prev = oldest->dc_prev;
        mutex_enter(&oldest->dc_anchor->dca_lock);
        /*
         * If this was the last entry then it must be too large.
         * Mark it as such by saving a special dircache_t
         * pointer (DC_RET_LOW_MEM) in the anchor. The error DNOMEM
         * will be presented to the caller of dnlc_dir_start()
         */
        if (oldest->dc_next == oldest->dc_prev) {
            oldest->dc_anchor->dca_dircache = DC_RET_LOW_MEM;
            ncs.ncs_dir_rec_last.value.ui64++;
        } else {
            oldest->dc_anchor->dca_dircache = NULL;
            ncs.ncs_dir_recl_any.value.ui64++;
        }
        mutex_exit(&oldest->dc_anchor->dca_lock);
        dirent_cnt += oldest->dc_num_entries;
        dnlc_dir_abort(oldest);
    }
    mutex_exit(&dc_head.dch_lock);
}
/*
 * Dynamically grow or shrink the size of the name hash table
 */
static void
dnlc_dir_adjust_nhash(dircache_t *dcp)
{
    dcentry_t **newhash, *dep, **nhp, *tep;
    uint_t newsize;
    uint_t oldsize;
    uint_t newsizemask;
    int i;

    /*
     * Allocate new hash table
     */
    newsize = dcp->dc_num_entries >> dnlc_dir_hash_size_shift;
    newhash = kmem_zalloc(sizeof (dcentry_t *) * newsize, KM_NOSLEEP);
    if (newhash == NULL) {
        /*
         * System is short on memory; just return.
         * Note, the old hash table is still usable.
         * This return is unlikely to repeatedly occur, because
         * either some other directory caches will be reclaimed
         * due to memory shortage, thus freeing memory, or this
         * directory cache will be reclaimed.
         */
        return;
    }
    oldsize = dcp->dc_nhash_mask + 1;
    dcp->dc_nhash_mask = newsizemask = newsize - 1;

    /*
     * Move entries from the old table to the new
     */
    for (i = 0; i < oldsize; i++) { /* for each hash bucket */
        dep = dcp->dc_namehash[i];
        while (dep != NULL) { /* for each chained entry */
            tep = dep;
            dep = dep->de_next;
            nhp = &newhash[tep->de_hash & newsizemask];
            tep->de_next = *nhp;
            *nhp = tep;
        }
    }

    /*
     * delete old hash table and set new one in place
     */
    kmem_free(dcp->dc_namehash, sizeof (dcentry_t *) * oldsize);
    dcp->dc_namehash = newhash;
}
/*
 * Dynamically grow or shrink the size of the free space hash table
 */
static void
dnlc_dir_adjust_fhash(dircache_t *dcp)
{
    dcfree_t **newhash, *dfp, **nhp, *tfp;
    uint_t newsize;
    uint_t oldsize;
    int i;

    /*
     * Allocate new hash table
     */
    newsize = dcp->dc_num_free >> dnlc_dir_hash_size_shift;
    newhash = kmem_zalloc(sizeof (dcfree_t *) * newsize, KM_NOSLEEP);
    if (newhash == NULL) {
        /*
         * System is short on memory; just return.
         * Note, the old hash table is still usable.
         * This return is unlikely to repeatedly occur, because
         * either some other directory caches will be reclaimed
         * due to memory shortage, thus freeing memory, or this
         * directory cache will be reclaimed.
         */
        return;
    }
    oldsize = dcp->dc_fhash_mask + 1;
    dcp->dc_fhash_mask = newsize - 1;

    /*
     * Move entries from the old table to the new
     */
    for (i = 0; i < oldsize; i++) { /* for each hash bucket */
        dfp = dcp->dc_freehash[i];
        while (dfp != NULL) { /* for each chained entry */
            tfp = dfp;
            dfp = dfp->df_next;
            nhp = &newhash[DDFHASH(tfp->df_handle, dcp)];
            tfp->df_next = *nhp;
            *nhp = tfp;
        }
    }

    /*
     * delete old hash table and set new one in place
     */
    kmem_free(dcp->dc_freehash, sizeof (dcfree_t *) * oldsize);
    dcp->dc_freehash = newhash;
}