1 fs/mbcache.c: decouple the locking of local from global data
3 From: T Makphaibulchoke <tmac@hp.com>
5 The patch increases the parallelism of mbcache by using the built-in
6 lock in the hlist_bl_node to protect the mb_cache's local block and
7 index hash chains.  The global data mb_cache_lru_list and
8 mb_cache_list continue to be protected by the global
9 mb_cache_spinlock.
11 A new block group spinlock, mb_cache_bg_lock, is also added to serialize
12 accesses to mb_cache_entry's local data.
14 A new member e_refcnt is added to the mb_cache_entry structure to help
15 prevent an mb_cache_entry from being deallocated by a free while it
16 is being referenced by either mb_cache_entry_get() or
17 mb_cache_entry_find().
19 Signed-off-by: T. Makphaibulchoke <tmac@hp.com>
20 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
21 ---
22  fs/mbcache.c | 417 ++++++++++++++++++++++++++++++++++++++++++-----------------
23  1 file changed, 301 insertions(+), 116 deletions(-)
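
Review note (not part of the patch): the lock nesting described above, that is
a block or index hash chain lock first, then the per-entry lock backed by
mb_cache_bg_lock, then mb_cache_spinlock, can be pictured with a small
userspace sketch.  Everything in it (the pthread mutexes, the cache_entry
struct, the entry_lock()/touch_entry()/release_entry() helpers) is an
illustrative stand-in for the kernel primitives, not the kernel API.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_ENTRY_LOCKS 16	/* stands in for NR_BG_LOCKS */

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;	/* ~ mb_cache_spinlock */
static pthread_mutex_t entry_locks[NR_ENTRY_LOCKS];		/* ~ mb_cache_bg_lock  */

struct cache_entry {
	pthread_mutex_t *chain_lock;	/* ~ the hash chain's built-in bit lock */
	int used;			/* ~ e_used   */
	int queued;			/* ~ e_queued */
	bool on_lru;
};

/* Pick the per-entry lock by hashing the entry pointer, in the spirit of
 * MB_CACHE_ENTRY_LOCK_INDEX() in the patch. */
static pthread_mutex_t *entry_lock(struct cache_entry *ce)
{
	return &entry_locks[((uintptr_t)ce >> 6) % NR_ENTRY_LOCKS];
}

/* Permitted nesting: chain lock (highest order) first, then the entry lock. */
static void touch_entry(struct cache_entry *ce)
{
	pthread_mutex_lock(ce->chain_lock);
	pthread_mutex_lock(entry_lock(ce));
	ce->used++;
	pthread_mutex_unlock(entry_lock(ce));
	pthread_mutex_unlock(ce->chain_lock);
}

/* Release, shaped like __mb_cache_entry_release(): the global LRU lock is
 * only ever taken while the entry lock is already held, never the other
 * way around. */
static void release_entry(struct cache_entry *ce)
{
	pthread_mutex_lock(entry_lock(ce));
	ce->used--;
	if (!ce->used && !ce->queued) {
		pthread_mutex_lock(&lru_lock);	/* lowest-order lock, taken last */
		ce->on_lru = true;
		pthread_mutex_unlock(&lru_lock);
	}
	pthread_mutex_unlock(entry_lock(ce));
}

int main(void)
{
	pthread_mutex_t chain;
	struct cache_entry ce = { .chain_lock = &chain };

	pthread_mutex_init(&chain, NULL);
	for (int i = 0; i < NR_ENTRY_LOCKS; i++)
		pthread_mutex_init(&entry_locks[i], NULL);
	touch_entry(&ce);
	release_entry(&ce);
	printf("back on lru: %d\n", ce.on_lru);
	return 0;
}

Built with cc -pthread, the sketch keeps the same invariant the comment block
in the patch documents: the stand-in for mb_cache_spinlock is always the
innermost lock, and it is never held while trying to take a chain or entry
lock.
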
25 diff --git a/fs/mbcache.c b/fs/mbcache.c
26 index 55db0da..786ecab 100644
27 --- a/fs/mbcache.c
28 +++ b/fs/mbcache.c
29 @@ -26,6 +26,41 @@
30   * back on the lru list.
31   */
33 +/*
34 + * Lock descriptions and usage:
35 + *
36 + * Each hash chain of both the block and index hash tables now contains
37 + * a built-in lock used to serialize accesses to the hash chain.
38 + *
39 + * Accesses to global data structures mb_cache_list and mb_cache_lru_list
40 + * are serialized via the global spinlock mb_cache_spinlock.
41 + *
42 + * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
43 + * accesses to its local data, such as e_used and e_queued.
44 + *
45 + * Lock ordering:
46 + *
47 + * Each block hash chain's lock has the highest lock order, followed by an
48 + * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
49 + * lock), and mb_cache_spinlock, with the lowest order.  While holding
50 + * either a block or index hash chain lock, a thread can acquire an
51 + * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
52 + *
53 + * Synchronization:
54 + *
55 + * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
56 + * index hash chain, they need to lock the corresponding hash chain.  For each
57 + * mb_cache_entry within the chain, they need to lock the mb_cache_entry to
58 + * prevent any simultaneous release or free of the entry and also
59 + * to serialize accesses to either the e_used or e_queued member of the entry.
60 + *
61 + * To avoid having a dangling reference to an already freed
62 + * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
63 + * block hash chain and also no longer being referenced, that is, both
64 + * e_used and e_queued are 0.  When an mb_cache_entry is explicitly freed, it is
65 + * first removed from its block hash chain.
66 + */
68  #include <linux/kernel.h>
69  #include <linux/module.h>
71 @@ -37,6 +72,7 @@
72  #include <linux/list_bl.h>
73  #include <linux/mbcache.h>
74  #include <linux/init.h>
75 +#include <linux/blockgroup_lock.h>
77  #ifdef MB_CACHE_DEBUG
78  # define mb_debug(f...) do { \
79 @@ -57,8 +93,13 @@
81  #define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
83 +#define MB_CACHE_ENTRY_LOCK_BITS       __builtin_log2(NR_BG_LOCKS)
84 +#define        MB_CACHE_ENTRY_LOCK_INDEX(ce)                   \
85 +       (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
87  static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
88 -               
89 +static struct blockgroup_lock *mb_cache_bg_lock;
91  MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
92  MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
93  MODULE_LICENSE("GPL");
94 @@ -86,6 +127,20 @@ static LIST_HEAD(mb_cache_list);
95  static LIST_HEAD(mb_cache_lru_list);
96  static DEFINE_SPINLOCK(mb_cache_spinlock);
98 +static inline void
99 +__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
101 +       spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
102 +               MB_CACHE_ENTRY_LOCK_INDEX(ce)));
105 +static inline void
106 +__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
108 +       spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
109 +               MB_CACHE_ENTRY_LOCK_INDEX(ce)));
112  static inline int
113  __mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
115 @@ -113,11 +168,21 @@ __mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
116                 hlist_bl_del_init(&ce->e_index.o_list);
120 + * __mb_cache_entry_unhash_unlock()
121 + *
122 + * This function is called to unhash the entry from both its block and
123 + * index hash chains.
124 + * It assumes both the block and index hash chains are locked upon entry.
125 + * It also unlocks both hash chains upon exit.
126 + */
127  static inline void
128 -__mb_cache_entry_unhash(struct mb_cache_entry *ce)
129 +__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
131         __mb_cache_entry_unhash_index(ce);
132 +       hlist_bl_unlock(ce->e_index_hash_p);
133         __mb_cache_entry_unhash_block(ce);
134 +       hlist_bl_unlock(ce->e_block_hash_p);
137  static void
138 @@ -125,36 +190,47 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
140         struct mb_cache *cache = ce->e_cache;
142 -       mb_assert(!(ce->e_used || ce->e_queued));
143 +       mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
144         kmem_cache_free(cache->c_entry_cache, ce);
145         atomic_dec(&cache->c_entry_count);
149  static void
150 -__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
151 -       __releases(mb_cache_spinlock)
152 +__mb_cache_entry_release(struct mb_cache_entry *ce)
154 +       /* First lock the entry to serialize access to its local data. */
155 +       __spin_lock_mb_cache_entry(ce);
156         /* Wake up all processes queuing for this cache entry. */
157         if (ce->e_queued)
158                 wake_up_all(&mb_cache_queue);
159         if (ce->e_used >= MB_CACHE_WRITER)
160                 ce->e_used -= MB_CACHE_WRITER;
161 +       /*
162 +        * Make sure that all cache entries on lru_list have
163 +        * both e_used and e_queued equal to 0.
164 +        */
165         ce->e_used--;
166 -       if (!(ce->e_used || ce->e_queued)) {
167 -               if (!__mb_cache_entry_is_block_hashed(ce))
168 +       if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
169 +               if (!__mb_cache_entry_is_block_hashed(ce)) {
170 +                       __spin_unlock_mb_cache_entry(ce);
171                         goto forget;
172 -               mb_assert(list_empty(&ce->e_lru_list));
173 -               list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
174 +               }
175 +               /*
176 +                * Need access to the lru list; mb_cache_spinlock has the lowest
177 +                * lock order, so it can be taken while the entry lock is held.
178 +                */
179 +               spin_lock(&mb_cache_spinlock);
180 +               if (list_empty(&ce->e_lru_list))
181 +                       list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
182 +               spin_unlock(&mb_cache_spinlock);
183         }
184 -       spin_unlock(&mb_cache_spinlock);
185 +       __spin_unlock_mb_cache_entry(ce);
186         return;
187  forget:
188 -       spin_unlock(&mb_cache_spinlock);
189 +       mb_assert(list_empty(&ce->e_lru_list));
190         __mb_cache_entry_forget(ce, GFP_KERNEL);
194  /*
195   * mb_cache_shrink_scan()  memory pressure callback
196   *
197 @@ -177,17 +253,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
199         mb_debug("trying to free %d entries", nr_to_scan);
200         spin_lock(&mb_cache_spinlock);
201 -       while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
202 +       while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
203                 struct mb_cache_entry *ce =
204                         list_entry(mb_cache_lru_list.next,
205 -                                  struct mb_cache_entry, e_lru_list);
206 -               list_move_tail(&ce->e_lru_list, &free_list);
207 -               __mb_cache_entry_unhash(ce);
208 -               freed++;
209 +                               struct mb_cache_entry, e_lru_list);
210 +               list_del_init(&ce->e_lru_list);
211 +               if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
212 +                       continue;
213 +               spin_unlock(&mb_cache_spinlock);
214 +               /* Prevent any find or get operation on the entry */
215 +               hlist_bl_lock(ce->e_block_hash_p);
216 +               hlist_bl_lock(ce->e_index_hash_p);
217 +               /* Ignore if it is touched by a find/get */
218 +               if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
219 +                       !list_empty(&ce->e_lru_list)) {
220 +                       hlist_bl_unlock(ce->e_index_hash_p);
221 +                       hlist_bl_unlock(ce->e_block_hash_p);
222 +                       spin_lock(&mb_cache_spinlock);
223 +                       continue;
224 +               }
225 +               __mb_cache_entry_unhash_unlock(ce);
226 +               list_add_tail(&ce->e_lru_list, &free_list);
227 +               spin_lock(&mb_cache_spinlock);
228         }
229         spin_unlock(&mb_cache_spinlock);
231         list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
232                 __mb_cache_entry_forget(entry, gfp_mask);
233 +               freed++;
234         }
235         return freed;
237 @@ -232,6 +325,14 @@ mb_cache_create(const char *name, int bucket_bits)
238         int n, bucket_count = 1 << bucket_bits;
239         struct mb_cache *cache = NULL;
241 +       if (!mb_cache_bg_lock) {
242 +               mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
243 +                       GFP_KERNEL);
244 +               if (!mb_cache_bg_lock)
245 +                       return NULL;
246 +               bgl_lock_init(mb_cache_bg_lock);
247 +       }
249         cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
250         if (!cache)
251                 return NULL;
252 @@ -290,21 +391,47 @@ void
253  mb_cache_shrink(struct block_device *bdev)
255         LIST_HEAD(free_list);
256 -       struct list_head *l, *ltmp;
257 +       struct list_head *l;
258 +       struct mb_cache_entry *ce, *tmp;
260 +       l = &mb_cache_lru_list;
261         spin_lock(&mb_cache_spinlock);
262 -       list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
263 -               struct mb_cache_entry *ce =
264 -                       list_entry(l, struct mb_cache_entry, e_lru_list);
265 +       while (!list_is_last(l, &mb_cache_lru_list)) {
266 +               l = l->next;
267 +               ce = list_entry(l, struct mb_cache_entry, e_lru_list);
268                 if (ce->e_bdev == bdev) {
269 -                       list_move_tail(&ce->e_lru_list, &free_list);
270 -                       __mb_cache_entry_unhash(ce);
271 +                       list_del_init(&ce->e_lru_list);
272 +                       if (ce->e_used || ce->e_queued ||
273 +                               atomic_read(&ce->e_refcnt))
274 +                               continue;
275 +                       spin_unlock(&mb_cache_spinlock);
276 +                       /*
277 +                        * Prevent any find or get operation on the entry.
278 +                        */
279 +                       hlist_bl_lock(ce->e_block_hash_p);
280 +                       hlist_bl_lock(ce->e_index_hash_p);
281 +                       /* Ignore if it is touched by a find/get */
282 +                       if (ce->e_used || ce->e_queued ||
283 +                               atomic_read(&ce->e_refcnt) ||
284 +                               !list_empty(&ce->e_lru_list)) {
285 +                               hlist_bl_unlock(ce->e_index_hash_p);
286 +                               hlist_bl_unlock(ce->e_block_hash_p);
287 +                               l = &mb_cache_lru_list;
288 +                               spin_lock(&mb_cache_spinlock);
289 +                               continue;
290 +                       }
291 +                       __mb_cache_entry_unhash_unlock(ce);
292 +                       mb_assert(!(ce->e_used || ce->e_queued ||
293 +                               atomic_read(&ce->e_refcnt)));
294 +                       list_add_tail(&ce->e_lru_list, &free_list);
295 +                       l = &mb_cache_lru_list;
296 +                       spin_lock(&mb_cache_spinlock);
297                 }
298         }
299         spin_unlock(&mb_cache_spinlock);
300 -       list_for_each_safe(l, ltmp, &free_list) {
301 -               __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
302 -                                                  e_lru_list), GFP_KERNEL);
304 +       list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
305 +               __mb_cache_entry_forget(ce, GFP_KERNEL);
306         }
309 @@ -320,23 +447,27 @@ void
310  mb_cache_destroy(struct mb_cache *cache)
312         LIST_HEAD(free_list);
313 -       struct list_head *l, *ltmp;
314 +       struct mb_cache_entry *ce, *tmp;
316         spin_lock(&mb_cache_spinlock);
317 -       list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
318 -               struct mb_cache_entry *ce =
319 -                       list_entry(l, struct mb_cache_entry, e_lru_list);
320 -               if (ce->e_cache == cache) {
321 +       list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
322 +               if (ce->e_cache == cache)
323                         list_move_tail(&ce->e_lru_list, &free_list);
324 -                       __mb_cache_entry_unhash(ce);
325 -               }
326         }
327         list_del(&cache->c_cache_list);
328         spin_unlock(&mb_cache_spinlock);
330 -       list_for_each_safe(l, ltmp, &free_list) {
331 -               __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
332 -                                                  e_lru_list), GFP_KERNEL);
333 +       list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
334 +               list_del_init(&ce->e_lru_list);
335 +               /*
336 +                * Prevent any find or get operation on the entry.
337 +                */
338 +               hlist_bl_lock(ce->e_block_hash_p);
339 +               hlist_bl_lock(ce->e_index_hash_p);
340 +               mb_assert(!(ce->e_used || ce->e_queued ||
341 +                       atomic_read(&ce->e_refcnt)));
342 +               __mb_cache_entry_unhash_unlock(ce);
343 +               __mb_cache_entry_forget(ce, GFP_KERNEL);
344         }
346         if (atomic_read(&cache->c_entry_count) > 0) {
347 @@ -345,8 +476,6 @@ mb_cache_destroy(struct mb_cache *cache)
348                           atomic_read(&cache->c_entry_count));
349         }
351 -       kmem_cache_destroy(cache->c_entry_cache);
353         kfree(cache->c_index_hash);
354         kfree(cache->c_block_hash);
355         kfree(cache);
356 @@ -363,29 +492,59 @@ mb_cache_destroy(struct mb_cache *cache)
357  struct mb_cache_entry *
358  mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
360 -       struct mb_cache_entry *ce = NULL;
361 +       struct mb_cache_entry *ce;
363         if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
364 +               struct list_head *l;
366 +               l = &mb_cache_lru_list;
367                 spin_lock(&mb_cache_spinlock);
368 -               if (!list_empty(&mb_cache_lru_list)) {
369 -                       ce = list_entry(mb_cache_lru_list.next,
370 -                                       struct mb_cache_entry, e_lru_list);
371 -                       list_del_init(&ce->e_lru_list);
372 -                       __mb_cache_entry_unhash(ce);
373 +               while (!list_is_last(l, &mb_cache_lru_list)) {
374 +                       l = l->next;
375 +                       ce = list_entry(l, struct mb_cache_entry, e_lru_list);
376 +                       if (ce->e_cache == cache) {
377 +                               list_del_init(&ce->e_lru_list);
378 +                               if (ce->e_used || ce->e_queued ||
379 +                                       atomic_read(&ce->e_refcnt))
380 +                                       continue;
381 +                               spin_unlock(&mb_cache_spinlock);
382 +                               /*
383 +                                * Prevent any find or get operation on the
384 +                                * entry.
385 +                                */
386 +                               hlist_bl_lock(ce->e_block_hash_p);
387 +                               hlist_bl_lock(ce->e_index_hash_p);
388 +                               /* Ignore if it is touched by a find/get */
389 +                               if (ce->e_used || ce->e_queued ||
390 +                                       atomic_read(&ce->e_refcnt) ||
391 +                                       !list_empty(&ce->e_lru_list)) {
392 +                                       hlist_bl_unlock(ce->e_index_hash_p);
393 +                                       hlist_bl_unlock(ce->e_block_hash_p);
394 +                                       l = &mb_cache_lru_list;
395 +                                       spin_lock(&mb_cache_spinlock);
396 +                                       continue;
397 +                               }
398 +                               mb_assert(list_empty(&ce->e_lru_list));
399 +                               mb_assert(!(ce->e_used || ce->e_queued ||
400 +                                       atomic_read(&ce->e_refcnt)));
401 +                               __mb_cache_entry_unhash_unlock(ce);
402 +                               goto found;
403 +                       }
404                 }
405                 spin_unlock(&mb_cache_spinlock);
406         }
407 -       if (!ce) {
408 -               ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
409 -               if (!ce)
410 -                       return NULL;
411 -               atomic_inc(&cache->c_entry_count);
412 -               INIT_LIST_HEAD(&ce->e_lru_list);
413 -               INIT_HLIST_BL_NODE(&ce->e_block_list);
414 -               INIT_HLIST_BL_NODE(&ce->e_index.o_list);
415 -               ce->e_cache = cache;
416 -               ce->e_queued = 0;
417 -       }
419 +       ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
420 +       if (!ce)
421 +               return NULL;
422 +       atomic_inc(&cache->c_entry_count);
423 +       INIT_LIST_HEAD(&ce->e_lru_list);
424 +       INIT_HLIST_BL_NODE(&ce->e_block_list);
425 +       INIT_HLIST_BL_NODE(&ce->e_index.o_list);
426 +       ce->e_cache = cache;
427 +       ce->e_queued = 0;
428 +       atomic_set(&ce->e_refcnt, 0);
429 +found:
430         ce->e_block_hash_p = &cache->c_block_hash[0];
431         ce->e_index_hash_p = &cache->c_index_hash[0];
432         ce->e_used = 1 + MB_CACHE_WRITER;
433 @@ -414,7 +573,6 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
434         struct mb_cache *cache = ce->e_cache;
435         unsigned int bucket;
436         struct hlist_bl_node *l;
437 -       int error = -EBUSY;
438         struct hlist_bl_head *block_hash_p;
439         struct hlist_bl_head *index_hash_p;
440         struct mb_cache_entry *lce;
441 @@ -423,26 +581,29 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
442         bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 
443                            cache->c_bucket_bits);
444         block_hash_p = &cache->c_block_hash[bucket];
445 -       spin_lock(&mb_cache_spinlock);
446 +       hlist_bl_lock(block_hash_p);
447         hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
448 -               if (lce->e_bdev == bdev && lce->e_block == block)
449 -                       goto out;
450 +               if (lce->e_bdev == bdev && lce->e_block == block) {
451 +                       hlist_bl_unlock(block_hash_p);
452 +                       return -EBUSY;
453 +               }
454         }
455         mb_assert(!__mb_cache_entry_is_block_hashed(ce));
456 -       __mb_cache_entry_unhash(ce);
457 +       __mb_cache_entry_unhash_block(ce);
458 +       __mb_cache_entry_unhash_index(ce);
459         ce->e_bdev = bdev;
460         ce->e_block = block;
461         ce->e_block_hash_p = block_hash_p;
462         ce->e_index.o_key = key;
463 +       hlist_bl_add_head(&ce->e_block_list, block_hash_p);
464 +       hlist_bl_unlock(block_hash_p);
465         bucket = hash_long(key, cache->c_bucket_bits);
466         index_hash_p = &cache->c_index_hash[bucket];
467 +       hlist_bl_lock(index_hash_p);
468         ce->e_index_hash_p = index_hash_p;
469         hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
470 -       hlist_bl_add_head(&ce->e_block_list, block_hash_p);
471 -       error = 0;
472 -out:
473 -       spin_unlock(&mb_cache_spinlock);
474 -       return error;
475 +       hlist_bl_unlock(index_hash_p);
476 +       return 0;
480 @@ -456,24 +617,26 @@ out:
481  void
482  mb_cache_entry_release(struct mb_cache_entry *ce)
484 -       spin_lock(&mb_cache_spinlock);
485 -       __mb_cache_entry_release_unlock(ce);
486 +       __mb_cache_entry_release(ce);
490  /*
491   * mb_cache_entry_free()
492   *
493 - * This is equivalent to the sequence mb_cache_entry_takeout() --
494 - * mb_cache_entry_release().
495   */
496  void
497  mb_cache_entry_free(struct mb_cache_entry *ce)
499 -       spin_lock(&mb_cache_spinlock);
500 +       mb_assert(ce);
501         mb_assert(list_empty(&ce->e_lru_list));
502 -       __mb_cache_entry_unhash(ce);
503 -       __mb_cache_entry_release_unlock(ce);
504 +       hlist_bl_lock(ce->e_index_hash_p);
505 +       __mb_cache_entry_unhash_index(ce);
506 +       hlist_bl_unlock(ce->e_index_hash_p);
507 +       hlist_bl_lock(ce->e_block_hash_p);
508 +       __mb_cache_entry_unhash_block(ce);
509 +       hlist_bl_unlock(ce->e_block_hash_p);
510 +       __mb_cache_entry_release(ce);
514 @@ -497,39 +660,48 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
515         bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
516                            cache->c_bucket_bits);
517         block_hash_p = &cache->c_block_hash[bucket];
518 -       spin_lock(&mb_cache_spinlock);
519 +       /* First serialize access to the corresponding block hash chain. */
520 +       hlist_bl_lock(block_hash_p);
521         hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
522                 mb_assert(ce->e_block_hash_p == block_hash_p);
523                 if (ce->e_bdev == bdev && ce->e_block == block) {
524 -                       DEFINE_WAIT(wait);
525 +                       /*
526 +                        * Prevent a free from removing the entry.
527 +                        */
528 +                       atomic_inc(&ce->e_refcnt);
529 +                       hlist_bl_unlock(block_hash_p);
530 +                       __spin_lock_mb_cache_entry(ce);
531 +                       atomic_dec(&ce->e_refcnt);
532 +                       if (ce->e_used > 0) {
533 +                               DEFINE_WAIT(wait);
534 +                               while (ce->e_used > 0) {
535 +                                       ce->e_queued++;
536 +                                       prepare_to_wait(&mb_cache_queue, &wait,
537 +                                                       TASK_UNINTERRUPTIBLE);
538 +                                       __spin_unlock_mb_cache_entry(ce);
539 +                                       schedule();
540 +                                       __spin_lock_mb_cache_entry(ce);
541 +                                       ce->e_queued--;
542 +                               }
543 +                               finish_wait(&mb_cache_queue, &wait);
544 +                       }
545 +                       ce->e_used += 1 + MB_CACHE_WRITER;
546 +                       __spin_unlock_mb_cache_entry(ce);
548 -                       if (!list_empty(&ce->e_lru_list))
549 +                       if (!list_empty(&ce->e_lru_list)) {
550 +                               spin_lock(&mb_cache_spinlock);
551                                 list_del_init(&ce->e_lru_list);
553 -                       while (ce->e_used > 0) {
554 -                               ce->e_queued++;
555 -                               prepare_to_wait(&mb_cache_queue, &wait,
556 -                                               TASK_UNINTERRUPTIBLE);
557                                 spin_unlock(&mb_cache_spinlock);
558 -                               schedule();
559 -                               spin_lock(&mb_cache_spinlock);
560 -                               ce->e_queued--;
561                         }
562 -                       finish_wait(&mb_cache_queue, &wait);
563 -                       ce->e_used += 1 + MB_CACHE_WRITER;
565                         if (!__mb_cache_entry_is_block_hashed(ce)) {
566 -                               __mb_cache_entry_release_unlock(ce);
567 +                               __mb_cache_entry_release(ce);
568                                 return NULL;
569                         }
570 -                       goto cleanup;
571 +                       return ce;
572                 }
573         }
574 -       ce = NULL;
576 -cleanup:
577 -       spin_unlock(&mb_cache_spinlock);
578 -       return ce;
579 +       hlist_bl_unlock(block_hash_p);
580 +       return NULL;
583  #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
584 @@ -538,40 +710,53 @@ static struct mb_cache_entry *
585  __mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
586                       struct block_device *bdev, unsigned int key)
589 +       /* The index hash chain lock is already held by the caller. */
590         while (l != NULL) {
591                 struct mb_cache_entry *ce =
592                         hlist_bl_entry(l, struct mb_cache_entry,
593                                 e_index.o_list);
594                 mb_assert(ce->e_index_hash_p == head);
595                 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
596 -                       DEFINE_WAIT(wait);
598 -                       if (!list_empty(&ce->e_lru_list))
599 -                               list_del_init(&ce->e_lru_list);
601 +                       /*
602 +                        * Prevent a free from removing the entry.
603 +                        */
604 +                       atomic_inc(&ce->e_refcnt);
605 +                       hlist_bl_unlock(head);
606 +                       __spin_lock_mb_cache_entry(ce);
607 +                       atomic_dec(&ce->e_refcnt);
608 +                       ce->e_used++;
609                         /* Incrementing before holding the lock gives readers
610                            priority over writers. */
611 -                       ce->e_used++;
612 -                       while (ce->e_used >= MB_CACHE_WRITER) {
613 -                               ce->e_queued++;
614 -                               prepare_to_wait(&mb_cache_queue, &wait,
615 -                                               TASK_UNINTERRUPTIBLE);
616 -                               spin_unlock(&mb_cache_spinlock);
617 -                               schedule();
618 +                       if (ce->e_used >= MB_CACHE_WRITER) {
619 +                               DEFINE_WAIT(wait);
621 +                               while (ce->e_used >= MB_CACHE_WRITER) {
622 +                                       ce->e_queued++;
623 +                                       prepare_to_wait(&mb_cache_queue, &wait,
624 +                                                       TASK_UNINTERRUPTIBLE);
625 +                                       __spin_unlock_mb_cache_entry(ce);
626 +                                       schedule();
627 +                                       __spin_lock_mb_cache_entry(ce);
628 +                                       ce->e_queued--;
629 +                               }
630 +                               finish_wait(&mb_cache_queue, &wait);
631 +                       }
632 +                       __spin_unlock_mb_cache_entry(ce);
633 +                       if (!list_empty(&ce->e_lru_list)) {
634                                 spin_lock(&mb_cache_spinlock);
635 -                               ce->e_queued--;
636 +                               list_del_init(&ce->e_lru_list);
637 +                               spin_unlock(&mb_cache_spinlock);
638                         }
639 -                       finish_wait(&mb_cache_queue, &wait);
641                         if (!__mb_cache_entry_is_block_hashed(ce)) {
642 -                               __mb_cache_entry_release_unlock(ce);
643 -                               spin_lock(&mb_cache_spinlock);
644 +                               __mb_cache_entry_release(ce);
645                                 return ERR_PTR(-EAGAIN);
646                         }
647                         return ce;
648                 }
649                 l = l->next;
650         }
651 +       hlist_bl_unlock(head);
652         return NULL;
655 @@ -598,12 +783,12 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
656         struct hlist_bl_head *index_hash_p;
658         index_hash_p = &cache->c_index_hash[bucket];
659 -       spin_lock(&mb_cache_spinlock);
660 +       hlist_bl_lock(index_hash_p);
661         if (!hlist_bl_empty(index_hash_p)) {
662                 l = hlist_bl_first(index_hash_p);
663                 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
664 -       }
665 -       spin_unlock(&mb_cache_spinlock);
666 +       } else
667 +               hlist_bl_unlock(index_hash_p);
668         return ce;
671 @@ -638,11 +823,11 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev,
673         index_hash_p = &cache->c_index_hash[bucket];
674         mb_assert(prev->e_index_hash_p == index_hash_p);
675 -       spin_lock(&mb_cache_spinlock);
676 +       hlist_bl_lock(index_hash_p);
677         mb_assert(!hlist_bl_empty(index_hash_p));
678         l = prev->e_index.o_list.next;
679         ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
680 -       __mb_cache_entry_release_unlock(prev);
681 +       __mb_cache_entry_release(prev);
682         return ce;
685 -- 
686 1.7.11.3
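
Review note (not part of the patch): the e_refcnt pinning used by
mb_cache_entry_get() and __mb_cache_entry_find() can be sketched in userspace
as follows.  The entry struct, entry_get()/entry_put()/entry_try_free() and
the pthread/stdatomic primitives are illustrative stand-ins, not the kernel
API; and unlike this simplification, the real lookup only reaches an entry by
walking its hash chain while that chain's lock is held, so it can never pick
up a pointer to an entry that has already been freed.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	pthread_mutex_t chain_lock;	/* ~ the hash chain's built-in lock           */
	pthread_mutex_t entry_lock;	/* ~ the mb_cache_bg_lock slot for this entry */
	atomic_int refcnt;		/* ~ e_refcnt */
	atomic_int used;		/* ~ e_used   */
};

/* Lookup side: pin the entry before dropping the chain lock, then finish
 * the bookkeeping under the per-entry lock. */
static void entry_get(struct entry *e)
{
	pthread_mutex_lock(&e->chain_lock);
	atomic_fetch_add(&e->refcnt, 1);	/* a concurrent free must now back off */
	pthread_mutex_unlock(&e->chain_lock);

	pthread_mutex_lock(&e->entry_lock);
	atomic_fetch_sub(&e->refcnt, 1);
	atomic_fetch_add(&e->used, 1);		/* convert the pin into a user reference */
	pthread_mutex_unlock(&e->entry_lock);
}

static void entry_put(struct entry *e)
{
	atomic_fetch_sub(&e->used, 1);
}

/* Shrinker side: free only when the entry is neither used nor pinned; the
 * checks are made while the chain lock is held, as in mb_cache_shrink_scan(). */
static bool entry_try_free(struct entry *e)
{
	bool ok;

	pthread_mutex_lock(&e->chain_lock);
	ok = !atomic_load(&e->used) && !atomic_load(&e->refcnt);
	/* the real code unhashes the entry here, still under the chain lock */
	pthread_mutex_unlock(&e->chain_lock);
	if (ok)
		free(e);
	return ok;
}

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e));

	pthread_mutex_init(&e->chain_lock, NULL);
	pthread_mutex_init(&e->entry_lock, NULL);
	entry_get(e);
	printf("free while in use: %d\n", entry_try_free(e));	/* 0 */
	entry_put(e);
	printf("free when idle:    %d\n", entry_try_free(e));	/* 1 */
	return 0;
}

The ordering is the point: the lookup raises the reference while the chain
lock is still held, and the shrink path re-checks that reference under the
same chain lock before unhashing and freeing, so the entry cannot disappear
in the window between dropping the chain lock and taking the per-entry lock.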