mbcache: decouple the locking of local from global data

From: T Makphaibulchoke <tmac@hp.com>

The patch increases the parallelism of mb_cache_entry utilization by
replacing list_head with hlist_bl_node for the implementation of both
the block and index hash tables.  Each hash chain head (hlist_bl_head)
carries a built-in bit spinlock that protects mb_cache's local block
and index hash chains.  The global data structures mb_cache_lru_list
and mb_cache_list continue to be protected by the global
mb_cache_spinlock.
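
To illustrate the resulting lock ordering, here is a minimal sketch,
written as if it sat inside fs/mbcache.c after this patch (it relies on
the file-static mb_cache_spinlock and on the new e_block_hash_p and
e_index_hash_p back pointers).  example_lock_order() is a hypothetical
helper used only to show the nesting that the comment block added below
documents; it is not code added by the patch, and the patch itself
typically holds at most two of these locks at once.

#include <linux/list_bl.h>
#include <linux/spinlock.h>
#include <linux/mbcache.h>

/* Hypothetical helper: demonstrates the documented lock order only. */
static void example_lock_order(struct mb_cache_entry *ce)
{
        /* 1. Highest order: the entry's block hash chain lock. */
        hlist_bl_lock(ce->e_block_hash_p);
        /* 2. Next: the entry's index hash chain lock, which also
         *    serializes accesses to ce->e_used and ce->e_queued. */
        hlist_bl_lock(ce->e_index_hash_p);
        /* 3. Lowest order: the global lock protecting mb_cache_list
         *    and mb_cache_lru_list. */
        spin_lock(&mb_cache_spinlock);

        /* ... operate on the entry and the global lists ... */

        spin_unlock(&mb_cache_spinlock);
        hlist_bl_unlock(ce->e_index_hash_p);
        hlist_bl_unlock(ce->e_block_hash_p);
}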

Signed-off-by: T. Makphaibulchoke <tmac@hp.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

 fs/mbcache.c            | 308 +++++++++++++++++++++++++++++++++++++++++++++------------------
 include/linux/mbcache.h |  10 ++-
 2 files changed, 229 insertions(+), 89 deletions(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index e519e45..44e7153 100644
 * back on the lru list.
+ * Lock descriptions and usage:
+ * Each hash chain of both the block and index hash tables now contains
+ * a built-in lock used to serialize accesses to the hash chain.
+ * Accesses to global data structures mb_cache_list and mb_cache_lru_list
+ * are serialized via the global spinlock mb_cache_spinlock.
+ * Each block hash chain's lock has the highest order, followed by each
+ * index hash chain's lock, with mb_cache_spinlock the lowest.
+ * While holding a block hash chain lock a thread can acquire either
+ * an index hash chain lock or mb_cache_spinlock.
+ * Since both the e_used and e_queued members of each mb_cache_entry can
+ * be updated while traversing either a block hash chain or an index hash
+ * chain, and the index hash chain lock has the lower order, each index hash
+ * chain's lock, in addition to being used to serialize accesses to the
+ * index hash chain, is also used to serialize accesses to both e_used
+ * and e_queued.
+ * To avoid having a dangling reference to an already freed
+ * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
+ * block hash chain and also no longer referenced, that is, both e_used
+ * and e_queued are zero.  When an mb_cache_entry is explicitly
+ * freed it is first removed from its block hash chain.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/list_bl.h>
#include <linux/mbcache.h>
# define mb_debug(f...) do { \
printk(KERN_DEBUG f); \
@@ -87,23 +119,34 @@ static LIST_HEAD(mb_cache_lru_list);
static DEFINE_SPINLOCK(mb_cache_spinlock);
-__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
+__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
- return !list_empty(&ce->e_block_list);
+ return !hlist_bl_unhashed(&ce->e_block_list);
-__mb_cache_entry_unhash(struct mb_cache_entry *ce)
+__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
- if (__mb_cache_entry_is_hashed(ce)) {
- list_del_init(&ce->e_block_list);
- list_del(&ce->e_index.o_list);
+ if (__mb_cache_entry_is_block_hashed(ce))
+ hlist_bl_del_init(&ce->e_block_list);
+__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
+ return !hlist_bl_unhashed(&ce->e_index.o_list);
+__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
+ if (__mb_cache_entry_is_index_hashed(ce))
+ hlist_bl_del(&ce->e_index.o_list);
__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
struct mb_cache *cache = ce->e_cache;
@@ -116,8 +159,8 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
- __releases(mb_cache_spinlock)
+ hlist_bl_lock(ce->e_index_hash_p);
/* Wake up all processes queuing for this cache entry. */
wake_up_all(&mb_cache_queue);
@@ -125,15 +168,20 @@ __mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
ce->e_used -= MB_CACHE_WRITER;
if (!(ce->e_used || ce->e_queued)) {
- if (!__mb_cache_entry_is_hashed(ce))
+ hlist_bl_unlock(ce->e_index_hash_p);
+ if (!__mb_cache_entry_is_block_hashed(ce))
+ spin_lock(&mb_cache_spinlock);
mb_assert(list_empty(&ce->e_lru_list));
list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
- spin_unlock(&mb_cache_spinlock);
+ spin_unlock(&mb_cache_spinlock);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ hlist_bl_unlock(ce->e_block_hash_p);
- spin_unlock(&mb_cache_spinlock);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ mb_assert(list_empty(&ce->e_lru_list));
__mb_cache_entry_forget(ce, GFP_KERNEL);
@@ -152,25 +200,38 @@ forget:
mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
- LIST_HEAD(free_list);
- struct mb_cache_entry *entry, *tmp;
int nr_to_scan = sc->nr_to_scan;
gfp_t gfp_mask = sc->gfp_mask;
unsigned long freed = 0;
mb_debug("trying to free %d entries", nr_to_scan);
- spin_lock(&mb_cache_spinlock);
- while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
- struct mb_cache_entry *ce =
- list_entry(mb_cache_lru_list.next,
- struct mb_cache_entry, e_lru_list);
- list_move_tail(&ce->e_lru_list, &free_list);
- __mb_cache_entry_unhash(ce);
- spin_unlock(&mb_cache_spinlock);
- list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
- __mb_cache_entry_forget(entry, gfp_mask);
+ while (nr_to_scan > 0) {
+ struct mb_cache_entry *ce;
+ spin_lock(&mb_cache_spinlock);
+ if (list_empty(&mb_cache_lru_list)) {
+ spin_unlock(&mb_cache_spinlock);
+ ce = list_entry(mb_cache_lru_list.next,
+ struct mb_cache_entry, e_lru_list);
+ list_del_init(&ce->e_lru_list);
+ spin_unlock(&mb_cache_spinlock);
+ hlist_bl_lock(ce->e_block_hash_p);
+ hlist_bl_lock(ce->e_index_hash_p);
+ if (!(ce->e_used || ce->e_queued)) {
+ __mb_cache_entry_unhash_index(ce);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ __mb_cache_entry_unhash_block(ce);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ __mb_cache_entry_forget(ce, gfp_mask);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ hlist_bl_unlock(ce->e_block_hash_p);
@@ -221,18 +282,18 @@ mb_cache_create(const char *name, int bucket_bits)
cache->c_name = name;
atomic_set(&cache->c_entry_count, 0);
cache->c_bucket_bits = bucket_bits;
- cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
+ cache->c_block_hash = kmalloc(bucket_count *
+ sizeof(struct hlist_bl_head), GFP_KERNEL);
if (!cache->c_block_hash)
for (n=0; n<bucket_count; n++)
- INIT_LIST_HEAD(&cache->c_block_hash[n]);
- cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
+ INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
+ cache->c_index_hash = kmalloc(bucket_count *
+ sizeof(struct hlist_bl_head), GFP_KERNEL);
if (!cache->c_index_hash)
for (n=0; n<bucket_count; n++)
- INIT_LIST_HEAD(&cache->c_index_hash[n]);
+ INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
cache->c_entry_cache = kmem_cache_create(name,
sizeof(struct mb_cache_entry), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
@@ -281,13 +342,24 @@ mb_cache_shrink(struct block_device *bdev)
list_entry(l, struct mb_cache_entry, e_lru_list);
if (ce->e_bdev == bdev) {
list_move_tail(&ce->e_lru_list, &free_list);
- __mb_cache_entry_unhash(ce);
spin_unlock(&mb_cache_spinlock);
list_for_each_safe(l, ltmp, &free_list) {
- __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
- e_lru_list), GFP_KERNEL);
+ struct mb_cache_entry *ce =
+ list_entry(l, struct mb_cache_entry, e_lru_list);
+ hlist_bl_lock(ce->e_block_hash_p);
+ hlist_bl_lock(ce->e_index_hash_p);
+ if (!(ce->e_used || ce->e_queued)) {
+ __mb_cache_entry_unhash_index(ce);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ __mb_cache_entry_unhash_block(ce);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ __mb_cache_entry_forget(ce, GFP_KERNEL);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ hlist_bl_unlock(ce->e_block_hash_p);
@@ -311,15 +383,21 @@ mb_cache_destroy(struct mb_cache *cache)
list_entry(l, struct mb_cache_entry, e_lru_list);
if (ce->e_cache == cache) {
list_move_tail(&ce->e_lru_list, &free_list);
- __mb_cache_entry_unhash(ce);
list_del(&cache->c_cache_list);
spin_unlock(&mb_cache_spinlock);
list_for_each_safe(l, ltmp, &free_list) {
- __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
- e_lru_list), GFP_KERNEL);
+ struct mb_cache_entry *ce =
+ list_entry(l, struct mb_cache_entry, e_lru_list);
+ hlist_bl_lock(ce->e_block_hash_p);
+ hlist_bl_lock(ce->e_index_hash_p);
+ __mb_cache_entry_unhash_index(ce);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ __mb_cache_entry_unhash_block(ce);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ __mb_cache_entry_forget(ce, GFP_KERNEL);
if (atomic_read(&cache->c_entry_count) > 0) {
@@ -349,12 +427,27 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
struct mb_cache_entry *ce = NULL;
if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
+ struct list_head *l;
+ struct list_head *ltmp;
spin_lock(&mb_cache_spinlock);
- if (!list_empty(&mb_cache_lru_list)) {
- ce = list_entry(mb_cache_lru_list.next,
- struct mb_cache_entry, e_lru_list);
- list_del_init(&ce->e_lru_list);
- __mb_cache_entry_unhash(ce);
+ list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
+ ce = list_entry(l, struct mb_cache_entry, e_lru_list);
+ if (ce->e_cache == cache) {
+ list_del_init(&ce->e_lru_list);
+ spin_unlock(&mb_cache_spinlock);
+ hlist_bl_lock(ce->e_block_hash_p);
+ hlist_bl_lock(ce->e_index_hash_p);
+ if (!(ce->e_used || ce->e_queued)) {
+ __mb_cache_entry_unhash_index(ce);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ __mb_cache_entry_unhash_block(ce);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ hlist_bl_unlock(ce->e_block_hash_p);
spin_unlock(&mb_cache_spinlock);
@@ -364,9 +457,12 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
atomic_inc(&cache->c_entry_count);
INIT_LIST_HEAD(&ce->e_lru_list);
- INIT_LIST_HEAD(&ce->e_block_list);
+ INIT_HLIST_BL_NODE(&ce->e_block_list);
+ INIT_HLIST_BL_NODE(&ce->e_index.o_list);
+ ce->e_block_hash_p = &cache->c_block_hash[0];
+ ce->e_index_hash_p = &cache->c_index_hash[0];
ce->e_used = 1 + MB_CACHE_WRITER;
@@ -393,28 +489,37 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
struct mb_cache *cache = ce->e_cache;
- struct list_head *l;
+ struct hlist_bl_node *l;
+ struct hlist_bl_head *block_hash_p;
+ struct hlist_bl_head *index_hash_p;
+ struct mb_cache_entry *lce;
bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
cache->c_bucket_bits);
- spin_lock(&mb_cache_spinlock);
- list_for_each_prev(l, &cache->c_block_hash[bucket]) {
- struct mb_cache_entry *ce =
- list_entry(l, struct mb_cache_entry, e_block_list);
- if (ce->e_bdev == bdev && ce->e_block == block)
+ block_hash_p = &cache->c_block_hash[bucket];
+ hlist_bl_lock(block_hash_p);
+ hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
+ if (lce->e_bdev == bdev && lce->e_block == block)
- __mb_cache_entry_unhash(ce);
+ mb_assert(!__mb_cache_entry_is_block_hashed(ce));
+ __mb_cache_entry_unhash_index(ce);
+ __mb_cache_entry_unhash_block(ce);
- list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
+ hlist_bl_add_head(&ce->e_block_list, block_hash_p);
+ ce->e_block_hash_p = block_hash_p;
ce->e_index.o_key = key;
bucket = hash_long(key, cache->c_bucket_bits);
- list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
+ index_hash_p = &cache->c_index_hash[bucket];
+ hlist_bl_lock(index_hash_p);
+ hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
+ ce->e_index_hash_p = index_hash_p;
+ hlist_bl_unlock(index_hash_p);
- spin_unlock(&mb_cache_spinlock);
+ hlist_bl_unlock(block_hash_p);
@@ -429,7 +534,7 @@ out:
mb_cache_entry_release(struct mb_cache_entry *ce)
- spin_lock(&mb_cache_spinlock);
+ hlist_bl_lock(ce->e_block_hash_p);
__mb_cache_entry_release_unlock(ce);
@@ -443,9 +548,13 @@ mb_cache_entry_release(struct mb_cache_entry *ce)
mb_cache_entry_free(struct mb_cache_entry *ce)
- spin_lock(&mb_cache_spinlock);
+ hlist_bl_lock(ce->e_block_hash_p);
+ hlist_bl_lock(ce->e_index_hash_p);
mb_assert(list_empty(&ce->e_lru_list));
- __mb_cache_entry_unhash(ce);
+ __mb_cache_entry_unhash_index(ce);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ __mb_cache_entry_unhash_block(ce);
__mb_cache_entry_release_unlock(ce);
@@ -463,33 +572,46 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
- struct list_head *l;
+ struct hlist_bl_node *l;
struct mb_cache_entry *ce;
+ struct hlist_bl_head *block_hash_p;
+ int held_block_lock = 1;
bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
cache->c_bucket_bits);
- spin_lock(&mb_cache_spinlock);
- list_for_each(l, &cache->c_block_hash[bucket]) {
- ce = list_entry(l, struct mb_cache_entry, e_block_list);
+ block_hash_p = &cache->c_block_hash[bucket];
+ hlist_bl_lock(block_hash_p);
+ hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
+ mb_assert(ce->e_block_hash_p == block_hash_p);
if (ce->e_bdev == bdev && ce->e_block == block) {
+ spin_lock(&mb_cache_spinlock);
if (!list_empty(&ce->e_lru_list))
list_del_init(&ce->e_lru_list);
+ spin_unlock(&mb_cache_spinlock);
+ hlist_bl_lock(ce->e_index_hash_p);
while (ce->e_used > 0) {
+ hlist_bl_unlock(ce->e_index_hash_p);
prepare_to_wait(&mb_cache_queue, &wait,
TASK_UNINTERRUPTIBLE);
- spin_unlock(&mb_cache_spinlock);
+ if (held_block_lock) {
+ hlist_bl_unlock(block_hash_p);
+ held_block_lock = 0;
- spin_lock(&mb_cache_spinlock);
+ hlist_bl_lock(ce->e_index_hash_p);
- finish_wait(&mb_cache_queue, &wait);
ce->e_used += 1 + MB_CACHE_WRITER;
+ hlist_bl_unlock(ce->e_index_hash_p);
+ finish_wait(&mb_cache_queue, &wait);
- if (!__mb_cache_entry_is_hashed(ce)) {
+ if (!held_block_lock)
+ hlist_bl_lock(block_hash_p);
+ if (!__mb_cache_entry_is_block_hashed(ce)) {
__mb_cache_entry_release_unlock(ce);
@@ -499,24 +621,27 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
- spin_unlock(&mb_cache_spinlock);
+ hlist_bl_unlock(block_hash_p);
#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
static struct mb_cache_entry *
-__mb_cache_entry_find(struct list_head *l, struct list_head *head,
+__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
struct block_device *bdev, unsigned int key)
- while (l != head) {
+ while (l != NULL) {
struct mb_cache_entry *ce =
- list_entry(l, struct mb_cache_entry, e_index.o_list);
+ hlist_bl_entry(l, struct mb_cache_entry,
if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
+ spin_lock(&mb_cache_spinlock);
if (!list_empty(&ce->e_lru_list))
list_del_init(&ce->e_lru_list);
+ spin_unlock(&mb_cache_spinlock);
/* Incrementing before holding the lock gives readers
priority over writers. */
@@ -525,18 +650,23 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
prepare_to_wait(&mb_cache_queue, &wait,
TASK_UNINTERRUPTIBLE);
- spin_unlock(&mb_cache_spinlock);
+ hlist_bl_unlock(head);
- spin_lock(&mb_cache_spinlock);
+ hlist_bl_lock(head);
+ mb_assert(ce->e_index_hash_p == head);
+ hlist_bl_unlock(head);
finish_wait(&mb_cache_queue, &wait);
- if (!__mb_cache_entry_is_hashed(ce)) {
+ hlist_bl_lock(ce->e_block_hash_p);
+ if (!__mb_cache_entry_is_block_hashed(ce)) {
__mb_cache_entry_release_unlock(ce);
- spin_lock(&mb_cache_spinlock);
+ hlist_bl_lock(head);
return ERR_PTR(-EAGAIN);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ mb_assert(ce->e_index_hash_p == head);
@@ -562,13 +692,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
unsigned int bucket = hash_long(key, cache->c_bucket_bits);
- struct list_head *l;
- struct mb_cache_entry *ce;
+ struct hlist_bl_node *l;
+ struct mb_cache_entry *ce = NULL;
+ struct hlist_bl_head *index_hash_p;
- spin_lock(&mb_cache_spinlock);
- l = cache->c_index_hash[bucket].next;
- ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
- spin_unlock(&mb_cache_spinlock);
+ index_hash_p = &cache->c_index_hash[bucket];
+ hlist_bl_lock(index_hash_p);
+ if (!hlist_bl_empty(index_hash_p)) {
+ l = hlist_bl_first(index_hash_p);
+ ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+ hlist_bl_unlock(index_hash_p);
@@ -597,13 +731,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev,
struct mb_cache *cache = prev->e_cache;
unsigned int bucket = hash_long(key, cache->c_bucket_bits);
- struct list_head *l;
+ struct hlist_bl_node *l;
struct mb_cache_entry *ce;
+ struct hlist_bl_head *index_hash_p;
- spin_lock(&mb_cache_spinlock);
+ index_hash_p = &cache->c_index_hash[bucket];
+ mb_assert(prev->e_index_hash_p == index_hash_p);
+ hlist_bl_lock(index_hash_p);
+ mb_assert(!hlist_bl_empty(index_hash_p));
l = prev->e_index.o_list.next;
- ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
- __mb_cache_entry_release_unlock(prev);
+ ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+ hlist_bl_unlock(index_hash_p);

diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 5525d37..89826c0 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -11,11 +11,13 @@ struct mb_cache_entry {
unsigned short e_queued;
struct block_device *e_bdev;
- struct list_head e_block_list;
+ struct hlist_bl_node e_block_list;
- struct list_head o_list;
+ struct hlist_bl_node o_list;
+ struct hlist_bl_head *e_block_hash_p;
+ struct hlist_bl_head *e_index_hash_p;
@@ -25,8 +27,8 @@ struct mb_cache {
struct kmem_cache *c_entry_cache;
- struct list_head *c_block_hash;
- struct list_head *c_index_hash;
+ struct hlist_bl_head *c_block_hash;
+ struct hlist_bl_head *c_index_hash;
/* Functions on caches */
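
Editorial note: the unhash-and-forget pattern repeated above in
mb_cache_shrink_scan(), mb_cache_shrink() and mb_cache_entry_alloc()
follows the rule stated in the new header comment: an entry may only be
forgotten once it is off its block hash chain and both e_used and
e_queued are zero.  Below is a hedged sketch of that rule, again
written as if it lived in fs/mbcache.c; mb_cache_try_free_entry() is a
hypothetical helper, not part of the patch, and it assumes the entry
has already been removed from the LRU list under mb_cache_spinlock.

/* Hypothetical helper: frees ce only if it is unused and unqueued. */
static int mb_cache_try_free_entry(struct mb_cache_entry *ce, gfp_t gfp_mask)
{
        /* Take the locks in the documented order: block chain first. */
        hlist_bl_lock(ce->e_block_hash_p);
        hlist_bl_lock(ce->e_index_hash_p);
        if (ce->e_used || ce->e_queued) {
                /* Still referenced or waited on: leave it hashed. */
                hlist_bl_unlock(ce->e_index_hash_p);
                hlist_bl_unlock(ce->e_block_hash_p);
                return 0;
        }
        /* Safe to free: drop it from both hash chains, then forget it. */
        __mb_cache_entry_unhash_index(ce);
        hlist_bl_unlock(ce->e_index_hash_p);
        __mb_cache_entry_unhash_block(ce);
        hlist_bl_unlock(ce->e_block_hash_p);
        __mb_cache_entry_forget(ce, gfp_mask);
        return 1;
}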