mbcache: add reusable flag to cache entries

From: Andreas Gruenbacher <agruenba@redhat.com>

To limit the damage caused by a single bad block, we cap the number of
inodes sharing an xattr block at 1024. Thus there can be multiple xattr
blocks with the same contents when there are lots of files with the same
extended attributes. These xattr blocks naturally result in hash
collisions and can form long hash chains, and we end up needlessly
checking each such block only to find out that we cannot use it because
it is already shared by too many inodes.

Add a reusable flag to cache entries which is cleared once a cache entry
has reached its maximum refcount.  Cache entries which are not marked
reusable are skipped by mb_cache_entry_find_{first,next}(). This
significantly speeds up mbcache when there are many identical xattr
blocks. For example, in xattr-bench with 5 values and each process
handling 20000 files, the 64-process run is 25x faster with this patch.
Even for 8 processes the speedup is almost 3x. We have also verified
that in situations where there is only one xattr block of each kind,
the patch has no measurable cost.
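
As a rough usage sketch only (illustration, not part of the diff below;
the helper names are made up), a filesystem caching shared xattr blocks
passes the reusable hint when inserting an entry and flips it back on
once the on-disk refcount drops below the limit again:

	/* Hypothetical helper: insert a block into the cache.  Entries
	 * created as non-reusable are never returned by
	 * mb_cache_entry_find_{first,next}(). */
	static int xattr_block_cache_insert(struct mb_cache *cache,
					    struct buffer_head *bh,
					    u32 hash, u32 refcount,
					    u32 refcount_max)
	{
		return mb_cache_entry_create(cache, GFP_NOFS, hash,
					     bh->b_blocknr,
					     refcount < refcount_max);
	}

	/* Hypothetical helper: when a reference is released and the block
	 * becomes shareable again, look the entry up by block number and
	 * re-enable it. */
	static void xattr_block_mark_reusable(struct mb_cache *cache,
					      struct buffer_head *bh, u32 hash)
	{
		struct mb_cache_entry *ce;

		ce = mb_cache_entry_get(cache, hash, bh->b_blocknr);
		if (ce) {
			ce->e_reusable = 1;
			mb_cache_entry_put(cache, ce);
		}
	}

ext4 follows this pattern in ext4_xattr_cache_insert() and
ext4_xattr_release_block() below.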

[JK: Remove handling of setting the same value since it is not needed
anymore, check for races in e_reusable setting, improve changelog,
add measurements]

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext2/xattr.c         |  2 +-
 fs/ext4/xattr.c         | 66 +++++++++++++++++++++++++++++++------------------
 fs/mbcache.c            | 38 +++++++++++++++++++++++++---
 include/linux/mbcache.h |  5 +++-
 4 files changed, 81 insertions(+), 30 deletions(-)

diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 71d58c2d7a19..1a5e3bff0b63 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -823,7 +823,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
        __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
        int error;
 
-       error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr);
+       error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
        if (error) {
                if (error == -EBUSY) {
                        ea_bdebug(bh, "already in cache (%d cache entries)",
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b661ae8332e3..0441e055c8e8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -545,6 +545,8 @@ static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                         struct buffer_head *bh)
 {
+       struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+       u32 hash, ref;
        int error = 0;
 
        BUFFER_TRACE(bh, "get_write_access");
@@ -553,23 +555,34 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                goto out;
 
        lock_buffer(bh);
-       if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
-               __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
-
+       hash = le32_to_cpu(BHDR(bh)->h_hash);
+       ref = le32_to_cpu(BHDR(bh)->h_refcount);
+       if (ref == 1) {
                ea_bdebug(bh, "refcount now=0; freeing");
                /*
                 * This must happen under buffer lock for
                 * ext4_xattr_block_set() to reliably detect freed block
                 */
-               mb_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash,
-                                           bh->b_blocknr);
+               mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
                get_bh(bh);
                unlock_buffer(bh);
                ext4_free_blocks(handle, inode, bh, 0, 1,
                                 EXT4_FREE_BLOCKS_METADATA |
                                 EXT4_FREE_BLOCKS_FORGET);
        } else {
-               le32_add_cpu(&BHDR(bh)->h_refcount, -1);
+               ref--;
+               BHDR(bh)->h_refcount = cpu_to_le32(ref);
+               if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
+                       struct mb_cache_entry *ce;
+
+                       ce = mb_cache_entry_get(ext4_mb_cache, hash,
+                                               bh->b_blocknr);
+                       if (ce) {
+                               ce->e_reusable = 1;
+                               mb_cache_entry_put(ext4_mb_cache, ce);
+                       }
+               }
+
                /*
                 * Beware of this ugliness: Releasing of xattr block references
                 * from different inodes can race and so we have to protect
@@ -872,6 +885,8 @@ inserted:
                        if (new_bh == bs->bh)
                                ea_bdebug(new_bh, "keeping");
                        else {
+                               u32 ref;
+
                                /* The old block is released after updating
                                   the inode. */
                                error = dquot_alloc_block(inode,
@@ -886,15 +901,18 @@ inserted:
                                lock_buffer(new_bh);
                                /*
                                 * We have to be careful about races with
-                                * freeing or rehashing of xattr block. Once we
-                                * hold buffer lock xattr block's state is
-                                * stable so we can check whether the block got
-                                * freed / rehashed or not.  Since we unhash
-                                * mbcache entry under buffer lock when freeing
-                                * / rehashing xattr block, checking whether
-                                * entry is still hashed is reliable.
+                                * freeing, rehashing or adding references to
+                                * xattr block. Once we hold buffer lock xattr
+                                * block's state is stable so we can check
+                                * whether the block got freed / rehashed or
+                                * not.  Since we unhash mbcache entry under
+                                * buffer lock when freeing / rehashing xattr
+                                * block, checking whether entry is still
+                                * hashed is reliable. Same rules hold for
+                                * e_reusable handling.
                                 */
-                               if (hlist_bl_unhashed(&ce->e_hash_list)) {
+                               if (hlist_bl_unhashed(&ce->e_hash_list) ||
+                                   !ce->e_reusable) {
                                        /*
                                         * Undo everything and check mbcache
                                         * again.
@@ -909,9 +927,12 @@ inserted:
                                        new_bh = NULL;
                                        goto inserted;
                                }
-                               le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
+                               ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+                               BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
+                               if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+                                       ce->e_reusable = 0;
                                ea_bdebug(new_bh, "reusing; refcount now=%d",
-                                       le32_to_cpu(BHDR(new_bh)->h_refcount));
+                                         ref);
                                unlock_buffer(new_bh);
                                error = ext4_handle_dirty_xattr_block(handle,
                                                                      inode,
@@ -1566,11 +1587,14 @@ cleanup:
 static void
 ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
 {
-       __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+       struct ext4_xattr_header *header = BHDR(bh);
+       __u32 hash = le32_to_cpu(header->h_hash);
+       int reusable = le32_to_cpu(header->h_refcount) <
+                      EXT4_XATTR_REFCOUNT_MAX;
        int error;
 
        error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
-                                     bh->b_blocknr);
+                                     bh->b_blocknr, reusable);
        if (error) {
                if (error == -EBUSY)
                        ea_bdebug(bh, "already in cache");
@@ -1645,12 +1669,6 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
                if (!bh) {
                        EXT4_ERROR_INODE(inode, "block %lu read error",
                                         (unsigned long) ce->e_block);
-               } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
-                               EXT4_XATTR_REFCOUNT_MAX) {
-                       ea_idebug(inode, "block %lu refcount %d>=%d",
-                                 (unsigned long) ce->e_block,
-                                 le32_to_cpu(BHDR(bh)->h_refcount),
-                                         EXT4_XATTR_REFCOUNT_MAX);
                } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
                        *pce = ce;
                        return bh;
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 903be151dcfe..eccda3a02de6 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -63,13 +63,14 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
  * @mask - gfp mask with which the entry should be allocated
  * @key - key of the entry
  * @block - block that contains data
+ * @reusable - is the block reusable by other inodes?
  *
  * Creates entry in @cache with key @key and records that data is stored in
  * block @block. The function returns -EBUSY if entry with the same key
  * and for the same block already exists in cache. Otherwise 0 is returned.
  */
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
-                         sector_t block)
+                         sector_t block, bool reusable)
 {
        struct mb_cache_entry *entry, *dup;
        struct hlist_bl_node *dup_node;
@@ -91,6 +92,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
        atomic_set(&entry->e_refcnt, 1);
        entry->e_key = key;
        entry->e_block = block;
+       entry->e_reusable = reusable;
        head = mb_cache_entry_head(cache, key);
        hlist_bl_lock(head);
        hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
@@ -137,7 +139,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
        while (node) {
                entry = hlist_bl_entry(node, struct mb_cache_entry,
                                       e_hash_list);
-               if (entry->e_key == key) {
+               if (entry->e_key == key && entry->e_reusable) {
                        atomic_inc(&entry->e_refcnt);
                        goto out;
                }
@@ -184,10 +186,38 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
 }
 EXPORT_SYMBOL(mb_cache_entry_find_next);
 
+/*
+ * mb_cache_entry_get - get a cache entry by block number (and key)
+ * @cache - cache we work with
+ * @key - key of block number @block
+ * @block - block number
+ */
+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
+                                         sector_t block)
+{
+       struct hlist_bl_node *node;
+       struct hlist_bl_head *head;
+       struct mb_cache_entry *entry;
+
+       head = mb_cache_entry_head(cache, key);
+       hlist_bl_lock(head);
+       hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+               if (entry->e_key == key && entry->e_block == block) {
+                       atomic_inc(&entry->e_refcnt);
+                       goto out;
+               }
+       }
+       entry = NULL;
+out:
+       hlist_bl_unlock(head);
+       return entry;
+}
+EXPORT_SYMBOL(mb_cache_entry_get);
+
 /* mb_cache_entry_delete_block - remove information about block from cache
  * @cache - cache we work with
- * @key - key of the entry to remove
- * @block - block containing data for @key
+ * @key - key of block @block
+ * @block - block number
  *
  * Remove entry from cache @cache with key @key with data stored in @block.
  */
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 607e6968542e..86c9a8b480c5 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -18,6 +18,7 @@ struct mb_cache_entry {
        /* Key in hash - stable during lifetime of the entry */
        u32                     e_key;
        u32                     e_referenced:1;
+       u32                     e_reusable:1;
        /* Block number of hashed block - stable during lifetime of the entry */
        sector_t                e_block;
 };
@@ -26,7 +27,7 @@ struct mb_cache *mb_cache_create(int bucket_bits);
 void mb_cache_destroy(struct mb_cache *cache);
 
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
-                         sector_t block);
+                         sector_t block, bool reusable);
 void __mb_cache_entry_free(struct mb_cache_entry *entry);
 static inline int mb_cache_entry_put(struct mb_cache *cache,
                                     struct mb_cache_entry *entry)
@@ -39,6 +40,8 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
 
 void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
                                  sector_t block);
+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
+                                         sector_t block);
 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
                                                 u32 key);
 struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
-- 
2.6.2