mbcache: add reusable flag to cache entries

From: Andreas Gruenbacher <agruenba@redhat.com>
To reduce the amount of damage caused by a single bad block, we limit the
number of inodes sharing an xattr block to 1024. Thus there can be multiple
xattr blocks with the same contents when there are lots of files with the
same extended attributes. These xattr blocks naturally result in hash
collisions and can form long hash chains, and we unnecessarily check each
such block only to find out we cannot use it because it is already shared
by too many inodes.

Add a reusable flag to cache entries which is cleared when a cache entry
has reached its maximum refcount. Cache entries which are not marked
reusable are skipped by mb_cache_entry_find_{first,next}. This
significantly speeds up mbcache when there are many identical xattr blocks.
For example, for xattr-bench with 5 values and each process handling
20000 files, the run for 64 processes is 25x faster with this patch.
Even for 8 processes the speedup is almost 3x. We have also verified
that for situations where there is only one xattr block of each kind,
the patch doesn't have a measurable cost.

[JK: Remove handling of setting the same value since it is not needed
anymore, check for races in e_reusable setting, improve changelog]

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext2/xattr.c         |  2 +-
 fs/ext4/xattr.c         | 66 +++++++++++++++++++++++++++++++------------------
 fs/mbcache.c            | 38 +++++++++++++++++++++++++---
 include/linux/mbcache.h |  5 +++-
 4 files changed, 81 insertions(+), 30 deletions(-)

diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 71d58c2d7a19..1a5e3bff0b63 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -823,7 +823,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
 	__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
 	int error;
 
-	error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr);
+	error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
 	if (error) {
 		if (error == -EBUSY) {
 			ea_bdebug(bh, "already in cache (%d cache entries)",
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b661ae8332e3..0441e055c8e8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -545,6 +545,8 @@ static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh)
 {
+	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+	u32 hash, ref;
 	int error = 0;
 
 	BUFFER_TRACE(bh, "get_write_access");
@@ -553,23 +555,34 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		goto out;
 
 	lock_buffer(bh);
-	if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
-		__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
-
+	hash = le32_to_cpu(BHDR(bh)->h_hash);
+	ref = le32_to_cpu(BHDR(bh)->h_refcount);
+	if (ref == 1) {
 		ea_bdebug(bh, "refcount now=0; freeing");
 		/*
 		 * This must happen under buffer lock for
 		 * ext4_xattr_block_set() to reliably detect freed block
 		 */
-		mb_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash,
-					    bh->b_blocknr);
+		mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
 		get_bh(bh);
 		unlock_buffer(bh);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
 				 EXT4_FREE_BLOCKS_METADATA |
 				 EXT4_FREE_BLOCKS_FORGET);
 	} else {
-		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
+		ref--;
+		BHDR(bh)->h_refcount = cpu_to_le32(ref);
+		if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
+			struct mb_cache_entry *ce;
+
+			ce = mb_cache_entry_get(ext4_mb_cache, hash,
+						bh->b_blocknr);
+			if (ce) {
+				ce->e_reusable = 1;
+				mb_cache_entry_put(ext4_mb_cache, ce);
+			}
+		}
+
 		/*
 		 * Beware of this ugliness: Releasing of xattr block references
 		 * from different inodes can race and so we have to protect
@@ -872,6 +885,8 @@ inserted:
 			if (new_bh == bs->bh)
 				ea_bdebug(new_bh, "keeping");
 			else {
+				u32 ref;
+
 				/* The old block is released after updating
 				   the inode. */
 				error = dquot_alloc_block(inode,
@@ -886,15 +901,18 @@ inserted:
 				lock_buffer(new_bh);
 				/*
 				 * We have to be careful about races with
-				 * freeing or rehashing of xattr block. Once we
-				 * hold buffer lock xattr block's state is
-				 * stable so we can check whether the block got
-				 * freed / rehashed or not. Since we unhash
-				 * mbcache entry under buffer lock when freeing
-				 * / rehashing xattr block, checking whether
-				 * entry is still hashed is reliable.
+				 * freeing, rehashing or adding references to
+				 * xattr block. Once we hold buffer lock xattr
+				 * block's state is stable so we can check
+				 * whether the block got freed / rehashed or
+				 * not. Since we unhash mbcache entry under
+				 * buffer lock when freeing / rehashing xattr
+				 * block, checking whether entry is still
+				 * hashed is reliable. Same rules hold for
+				 * e_reusable handling.
 				 */
-				if (hlist_bl_unhashed(&ce->e_hash_list)) {
+				if (hlist_bl_unhashed(&ce->e_hash_list) ||
+				    !ce->e_reusable) {
 					/*
 					 * Undo everything and check mbcache
 					 * again.
@@ -909,9 +927,12 @@ inserted:
 					new_bh = NULL;
 					goto inserted;
 				}
-				le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
+				ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+				BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
+				if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+					ce->e_reusable = 0;
 				ea_bdebug(new_bh, "reusing; refcount now=%d",
-					  le32_to_cpu(BHDR(new_bh)->h_refcount));
+					  ref);
 				unlock_buffer(new_bh);
 				error = ext4_handle_dirty_xattr_block(handle,
								      inode,
@@ -1566,11 +1587,14 @@ cleanup:
 static void
 ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
 {
-	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+	struct ext4_xattr_header *header = BHDR(bh);
+	__u32 hash = le32_to_cpu(header->h_hash);
+	int reusable = le32_to_cpu(header->h_refcount) <
+		       EXT4_XATTR_REFCOUNT_MAX;
 	int error;
 
 	error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
-				      bh->b_blocknr);
+				      bh->b_blocknr, reusable);
 	if (error) {
 		if (error == -EBUSY)
 			ea_bdebug(bh, "already in cache");
@@ -1645,12 +1669,6 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
 		if (!bh) {
 			EXT4_ERROR_INODE(inode, "block %lu read error",
 					 (unsigned long) ce->e_block);
-		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
-			   EXT4_XATTR_REFCOUNT_MAX) {
-			ea_idebug(inode, "block %lu refcount %d>=%d",
-				  (unsigned long) ce->e_block,
-				  le32_to_cpu(BHDR(bh)->h_refcount),
-				  EXT4_XATTR_REFCOUNT_MAX);
 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
 			*pce = ce;
 			return bh;
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 903be151dcfe..eccda3a02de6 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -63,13 +63,14 @@ static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
  * @mask - gfp mask with which the entry should be allocated
  * @key - key of the entry
  * @block - block that contains data
+ * @reusable - is the block reusable by other inodes?
  *
  * Creates entry in @cache with key @key and records that data is stored in
  * block @block. The function returns -EBUSY if entry with the same key
  * and for the same block already exists in cache. Otherwise 0 is returned.
  */
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
-			  sector_t block)
+			  sector_t block, bool reusable)
 {
 	struct mb_cache_entry *entry, *dup;
 	struct hlist_bl_node *dup_node;
@@ -91,6 +92,7 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 	atomic_set(&entry->e_refcnt, 1);
 	entry->e_key = key;
 	entry->e_block = block;
+	entry->e_reusable = reusable;
 	head = mb_cache_entry_head(cache, key);
 	hlist_bl_lock(head);
 	hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
@@ -137,7 +139,7 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
 	while (node) {
 		entry = hlist_bl_entry(node, struct mb_cache_entry,
 				       e_hash_list);
-		if (entry->e_key == key) {
+		if (entry->e_key == key && entry->e_reusable) {
 			atomic_inc(&entry->e_refcnt);
 			goto out;
 		}
@@ -184,10 +186,38 @@ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
 }
 EXPORT_SYMBOL(mb_cache_entry_find_next);
 
+/*
+ * mb_cache_entry_get - get a cache entry by block number (and key)
+ * @cache - cache we work with
+ * @key - key of block number @block
+ * @block - block number
+ */
+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
+					  sector_t block)
+{
+	struct hlist_bl_node *node;
+	struct hlist_bl_head *head;
+	struct mb_cache_entry *entry;
+
+	head = mb_cache_entry_head(cache, key);
+	hlist_bl_lock(head);
+	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+		if (entry->e_key == key && entry->e_block == block) {
+			atomic_inc(&entry->e_refcnt);
+			goto out;
+		}
+	}
+	entry = NULL;
+out:
+	hlist_bl_unlock(head);
+	return entry;
+}
+EXPORT_SYMBOL(mb_cache_entry_get);
+
 /* mb_cache_entry_delete_block - remove information about block from cache
  * @cache - cache we work with
- * @key - key of the entry to remove
- * @block - block containing data for @key
+ * @key - key of block @block
+ * @block - block number
  *
  * Remove entry from cache @cache with key @key with data stored in @block.
  */
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 607e6968542e..86c9a8b480c5 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -18,6 +18,7 @@ struct mb_cache_entry {
 	/* Key in hash - stable during lifetime of the entry */
 	u32			e_key;
 	u32			e_referenced:1;
+	u32			e_reusable:1;
 	/* Block number of hashed block - stable during lifetime of the entry */
 	sector_t		e_block;
 };
@@ -26,7 +27,7 @@ struct mb_cache *mb_cache_create(int bucket_bits);
 void mb_cache_destroy(struct mb_cache *cache);
 
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
-			  sector_t block);
+			  sector_t block, bool reusable);
 void __mb_cache_entry_free(struct mb_cache_entry *entry);
 static inline int mb_cache_entry_put(struct mb_cache *cache,
 				     struct mb_cache_entry *entry)
@@ -39,6 +40,8 @@ static inline int mb_cache_entry_put(struct mb_cache *cache,
 
 void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
 				 sector_t block);
+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
+					  sector_t block);
 struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
 						  u32 key);
 struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
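
For reference, the refcount/e_reusable transitions that the ext4 hunks above
implement can be sketched in isolation. This is a user-space approximation
under stated assumptions: MAX_REFS stands in for EXT4_XATTR_REFCOUNT_MAX,
and struct xattr_block is a hypothetical stand-in for the cache entry plus
the on-disk refcount, not the kernel code itself.

/*
 * Sketch of the state transitions: a block stops being advertised for
 * sharing once it reaches the limit, and becomes reusable again as soon
 * as one reference is dropped below the limit.
 */
#include <assert.h>
#include <stdbool.h>

#define MAX_REFS 1024	/* stand-in for EXT4_XATTR_REFCOUNT_MAX */

struct xattr_block {
	unsigned int	refcount;	/* mirrors h_refcount in the block header */
	bool		reusable;	/* mirrors e_reusable in the cache entry */
};

/* Reuse path: take a reference; stop offering the block at the limit. */
static void add_ref(struct xattr_block *b)
{
	b->refcount++;
	if (b->refcount >= MAX_REFS)
		b->reusable = false;
}

/* Release path: drop a reference; re-enable sharing just below the limit. */
static void drop_ref(struct xattr_block *b)
{
	b->refcount--;
	if (b->refcount == MAX_REFS - 1)
		b->reusable = true;
}

int main(void)
{
	struct xattr_block b = { .refcount = MAX_REFS - 1, .reusable = true };

	add_ref(&b);		/* reaches the limit ...            */
	assert(!b.reusable);	/* ... and is skipped by lookups    */
	drop_ref(&b);		/* one inode releases the block ... */
	assert(b.reusable);	/* ... so it can be shared again    */
	return 0;
}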