From 86a1947e5ddd2088c81198e73af3c961a4bcec35 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 19 Feb 2016 15:53:08 -0500 Subject: [PATCH] Add Jan Kara's mbcache 2 patch set --- ext2-convert-to-mbcache2 | 512 +++++++++++++++ ext4-convert-to-mbcache2 | 389 ++++++++++++ mbcache2-limit-cache-size | 139 +++++ mbcache2-use-referenced-bit-instead-of-LRU | 139 +++++ reimplement-mbcache | 541 ++++++++++++++++ remove-mbcache | 959 +++++++++++++++++++++++++++++ series | 9 + timestamps | 12 +- 8 files changed, 2697 insertions(+), 3 deletions(-) create mode 100644 ext2-convert-to-mbcache2 create mode 100644 ext4-convert-to-mbcache2 create mode 100644 mbcache2-limit-cache-size create mode 100644 mbcache2-use-referenced-bit-instead-of-LRU create mode 100644 reimplement-mbcache create mode 100644 remove-mbcache diff --git a/ext2-convert-to-mbcache2 b/ext2-convert-to-mbcache2 new file mode 100644 index 00000000..00bc485d --- /dev/null +++ b/ext2-convert-to-mbcache2 @@ -0,0 +1,512 @@ +ext2: convert to mbcache2 + +From: Jan Kara + +The conversion is generally straightforward. We convert filesystem from +a global cache to per-fs one. Similarly to ext4 the tricky part is that +xattr block corresponding to found mbcache entry can get freed before we +get buffer lock for that block. So we have to check whether the entry is +still valid after getting the buffer lock. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext2/ext2.h | 3 ++ + fs/ext2/super.c | 25 ++++++---- + fs/ext2/xattr.c | 146 +++++++++++++++++++++++++++----------------------------- + fs/ext2/xattr.h | 21 ++------ + 4 files changed, 95 insertions(+), 100 deletions(-) + +diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h +index 4c69c94cafd8..f98ce7e60a0f 100644 +--- a/fs/ext2/ext2.h ++++ b/fs/ext2/ext2.h +@@ -61,6 +61,8 @@ struct ext2_block_alloc_info { + #define rsv_start rsv_window._rsv_start + #define rsv_end rsv_window._rsv_end + ++struct mb2_cache; ++ + /* + * second extended-fs super-block data in memory + */ +@@ -111,6 +113,7 @@ struct ext2_sb_info { + * of the mount options. 
+ */ + spinlock_t s_lock; ++ struct mb2_cache *s_mb_cache; + }; + + static inline spinlock_t * +diff --git a/fs/ext2/super.c b/fs/ext2/super.c +index 748d35afc902..111a31761ffa 100644 +--- a/fs/ext2/super.c ++++ b/fs/ext2/super.c +@@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb) + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + +- ext2_xattr_put_super(sb); ++ if (sbi->s_mb_cache) { ++ ext2_xattr_destroy_cache(sbi->s_mb_cache); ++ sbi->s_mb_cache = NULL; ++ } + if (!(sb->s_flags & MS_RDONLY)) { + struct ext2_super_block *es = sbi->s_es; + +@@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) + ext2_msg(sb, KERN_ERR, "error: insufficient memory"); + goto failed_mount3; + } ++ ++#ifdef CONFIG_EXT2_FS_XATTR ++ sbi->s_mb_cache = ext2_xattr_create_cache(); ++ if (!sbi->s_mb_cache) { ++ ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache"); ++ goto failed_mount3; ++ } ++#endif + /* + * set up enough so that it can read an inode + */ +@@ -1149,6 +1160,8 @@ cantfind_ext2: + sb->s_id); + goto failed_mount; + failed_mount3: ++ if (sbi->s_mb_cache) ++ ext2_xattr_destroy_cache(sbi->s_mb_cache); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); +@@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2"); + + static int __init init_ext2_fs(void) + { +- int err = init_ext2_xattr(); +- if (err) +- return err; ++ int err; ++ + err = init_inodecache(); + if (err) +- goto out1; ++ return err; + err = register_filesystem(&ext2_fs_type); + if (err) + goto out; + return 0; + out: + destroy_inodecache(); +-out1: +- exit_ext2_xattr(); + return err; + } + +@@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void) + { + unregister_filesystem(&ext2_fs_type); + destroy_inodecache(); +- exit_ext2_xattr(); + } + + MODULE_AUTHOR("Remy Card and others"); +diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c +index fa70848afa8f..c7ab4cadcea0 100644 +--- a/fs/ext2/xattr.c ++++ b/fs/ext2/xattr.c +@@ -56,7 +56,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include +@@ -92,14 +92,12 @@ + static int ext2_xattr_set2(struct inode *, struct buffer_head *, + struct ext2_xattr_header *); + +-static int ext2_xattr_cache_insert(struct buffer_head *); ++static int ext2_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); + static struct buffer_head *ext2_xattr_cache_find(struct inode *, + struct ext2_xattr_header *); + static void ext2_xattr_rehash(struct ext2_xattr_header *, + struct ext2_xattr_entry *); + +-static struct mb_cache *ext2_xattr_cache; +- + static const struct xattr_handler *ext2_xattr_handler_map[] = { + [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, + #ifdef CONFIG_EXT2_FS_POSIX_ACL +@@ -154,6 +152,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, + size_t name_len, size; + char *end; + int error; ++ struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); +@@ -198,7 +197,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", + goto found; + entry = next; + } +- if (ext2_xattr_cache_insert(bh)) ++ if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + ea_idebug(inode, "cache insert failed"); + error = -ENODATA; + goto cleanup; +@@ -211,7 +210,7 @@ found: + le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) + goto bad_block; + 
+- if (ext2_xattr_cache_insert(bh)) ++ if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + ea_idebug(inode, "cache insert failed"); + if (buffer) { + error = -ERANGE; +@@ -249,6 +248,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + char *end; + size_t rest = buffer_size; + int error; ++ struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); +@@ -283,7 +283,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", + goto bad_block; + entry = next; + } +- if (ext2_xattr_cache_insert(bh)) ++ if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + ea_idebug(inode, "cache insert failed"); + + /* list the attribute names */ +@@ -480,22 +480,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set", + /* Here we know that we can set the new attribute. */ + + if (header) { +- struct mb_cache_entry *ce; +- + /* assert(header == HDR(bh)); */ +- ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, +- bh->b_blocknr); + lock_buffer(bh); + if (header->h_refcount == cpu_to_le32(1)) { ++ __u32 hash = le32_to_cpu(header->h_hash); ++ + ea_bdebug(bh, "modifying in-place"); +- if (ce) +- mb_cache_entry_free(ce); ++ /* ++ * This must happen under buffer lock for ++ * ext2_xattr_set2() to reliably detect modified block ++ */ ++ mb2_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, ++ hash, bh->b_blocknr); ++ + /* keep the buffer locked while modifying it. */ + } else { + int offset; + +- if (ce) +- mb_cache_entry_release(ce); + unlock_buffer(bh); + ea_bdebug(bh, "cloning"); + header = kmalloc(bh->b_size, GFP_KERNEL); +@@ -623,6 +624,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + int error; ++ struct mb2_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; + + if (header) { + new_bh = ext2_xattr_cache_find(inode, header); +@@ -650,7 +652,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + don't need to change the reference count. */ + new_bh = old_bh; + get_bh(new_bh); +- ext2_xattr_cache_insert(new_bh); ++ ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + } else { + /* We need to allocate a new block */ + ext2_fsblk_t goal = ext2_group_first_block_no(sb, +@@ -671,7 +673,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + memcpy(new_bh->b_data, header, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); +- ext2_xattr_cache_insert(new_bh); ++ ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + + ext2_xattr_update_super_block(sb); + } +@@ -704,19 +706,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + + error = 0; + if (old_bh && old_bh != new_bh) { +- struct mb_cache_entry *ce; +- + /* + * If there was an old block and we are no longer using it, + * release the old block. + */ +- ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev, +- old_bh->b_blocknr); + lock_buffer(old_bh); + if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { ++ __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash); ++ ++ /* ++ * This must happen under buffer lock for ++ * ext2_xattr_set2() to reliably detect freed block ++ */ ++ mb2_cache_entry_delete_block(ext2_mb_cache, ++ hash, old_bh->b_blocknr); + /* Free the old block. 
*/ +- if (ce) +- mb_cache_entry_free(ce); + ea_bdebug(old_bh, "freeing"); + ext2_free_blocks(inode, old_bh->b_blocknr, 1); + mark_inode_dirty(inode); +@@ -727,8 +731,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + } else { + /* Decrement the refcount only. */ + le32_add_cpu(&HDR(old_bh)->h_refcount, -1); +- if (ce) +- mb_cache_entry_release(ce); + dquot_free_block_nodirty(inode, 1); + mark_inode_dirty(inode); + mark_buffer_dirty(old_bh); +@@ -754,7 +756,6 @@ void + ext2_xattr_delete_inode(struct inode *inode) + { + struct buffer_head *bh = NULL; +- struct mb_cache_entry *ce; + + down_write(&EXT2_I(inode)->xattr_sem); + if (!EXT2_I(inode)->i_file_acl) +@@ -774,19 +775,22 @@ ext2_xattr_delete_inode(struct inode *inode) + EXT2_I(inode)->i_file_acl); + goto cleanup; + } +- ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr); + lock_buffer(bh); + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { +- if (ce) +- mb_cache_entry_free(ce); ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ ++ /* ++ * This must happen under buffer lock for ext2_xattr_set2() to ++ * reliably detect freed block ++ */ ++ mb2_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, ++ hash, bh->b_blocknr); + ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); + get_bh(bh); + bforget(bh); + unlock_buffer(bh); + } else { + le32_add_cpu(&HDR(bh)->h_refcount, -1); +- if (ce) +- mb_cache_entry_release(ce); + ea_bdebug(bh, "refcount now=%d", + le32_to_cpu(HDR(bh)->h_refcount)); + unlock_buffer(bh); +@@ -803,18 +807,6 @@ cleanup: + } + + /* +- * ext2_xattr_put_super() +- * +- * This is called when a file system is unmounted. +- */ +-void +-ext2_xattr_put_super(struct super_block *sb) +-{ +- mb_cache_shrink(sb->s_bdev); +-} +- +- +-/* + * ext2_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert +@@ -823,27 +815,22 @@ ext2_xattr_put_super(struct super_block *sb) + * Returns 0, or a negative error number on failure. 
+ */ + static int +-ext2_xattr_cache_insert(struct buffer_head *bh) ++ext2_xattr_cache_insert(struct mb2_cache *cache, struct buffer_head *bh) + { + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); +- struct mb_cache_entry *ce; ++ struct mb2_cache_entry *ce; + int error; + +- ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); +- if (!ce) +- return -ENOMEM; +- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); +- if (error) { +- mb_cache_entry_free(ce); +- if (error == -EBUSY) { ++ ce = mb2_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); ++ if (IS_ERR(ce)) { ++ if (PTR_ERR(ce) == -EBUSY) { + ea_bdebug(bh, "already in cache (%d cache entries)", + atomic_read(&ext2_xattr_cache->c_entry_count)); + error = 0; + } + } else { +- ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, +- atomic_read(&ext2_xattr_cache->c_entry_count)); +- mb_cache_entry_release(ce); ++ ea_bdebug(bh, "inserting [%x]", (int)hash); ++ mb2_cache_entry_put(cache, ce); + } + return error; + } +@@ -900,23 +887,17 @@ static struct buffer_head * + ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) + { + __u32 hash = le32_to_cpu(header->h_hash); +- struct mb_cache_entry *ce; ++ struct mb2_cache_entry *ce; ++ struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); + again: +- ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev, +- hash); ++ ce = mb2_cache_entry_find_first(ext2_mb_cache, hash); + while (ce) { + struct buffer_head *bh; + +- if (IS_ERR(ce)) { +- if (PTR_ERR(ce) == -EAGAIN) +- goto again; +- break; +- } +- + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext2_error(inode->i_sb, "ext2_xattr_cache_find", +@@ -924,7 +905,21 @@ again: + inode->i_ino, (unsigned long) ce->e_block); + } else { + lock_buffer(bh); +- if (le32_to_cpu(HDR(bh)->h_refcount) > ++ /* ++ * We have to be careful about races with freeing or ++ * rehashing of xattr block. Once we hold buffer lock ++ * xattr block's state is stable so we can check ++ * whether the block got freed / rehashed or not. ++ * Since we unhash mbcache entry under buffer lock when ++ * freeing / rehashing xattr block, checking whether ++ * entry is still hashed is reliable. 
++ */ ++ if (hlist_bl_unhashed(&ce->e_hash_list)) { ++ mb2_cache_entry_put(ext2_mb_cache, ce); ++ unlock_buffer(bh); ++ brelse(bh); ++ goto again; ++ } else if (le32_to_cpu(HDR(bh)->h_refcount) > + EXT2_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %ld refcount %d>%d", + (unsigned long) ce->e_block, +@@ -933,13 +928,14 @@ again: + } else if (!ext2_xattr_cmp(header, HDR(bh))) { + ea_bdebug(bh, "b_count=%d", + atomic_read(&(bh->b_count))); +- mb_cache_entry_release(ce); ++ mb2_cache_entry_touch(ext2_mb_cache, ce); ++ mb2_cache_entry_put(ext2_mb_cache, ce); + return bh; + } + unlock_buffer(bh); + brelse(bh); + } +- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); ++ ce = mb2_cache_entry_find_next(ext2_mb_cache, ce); + } + return NULL; + } +@@ -1012,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header, + + #undef BLOCK_HASH_SHIFT + +-int __init +-init_ext2_xattr(void) ++#define HASH_BUCKET_BITS 10 ++ ++struct mb2_cache *ext2_xattr_create_cache(void) + { +- ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); +- if (!ext2_xattr_cache) +- return -ENOMEM; +- return 0; ++ return mb2_cache_create(HASH_BUCKET_BITS); + } + +-void +-exit_ext2_xattr(void) ++void ext2_xattr_destroy_cache(struct mb2_cache *cache) + { +- mb_cache_destroy(ext2_xattr_cache); ++ if (cache) ++ mb2_cache_destroy(cache); + } +diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h +index 60edf298644e..6ea38aa9563a 100644 +--- a/fs/ext2/xattr.h ++++ b/fs/ext2/xattr.h +@@ -53,6 +53,8 @@ struct ext2_xattr_entry { + #define EXT2_XATTR_SIZE(size) \ + (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) + ++struct mb2_cache; ++ + # ifdef CONFIG_EXT2_FS_XATTR + + extern const struct xattr_handler ext2_xattr_user_handler; +@@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); + extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); + + extern void ext2_xattr_delete_inode(struct inode *); +-extern void ext2_xattr_put_super(struct super_block *); + +-extern int init_ext2_xattr(void); +-extern void exit_ext2_xattr(void); ++extern struct mb2_cache *ext2_xattr_create_cache(void); ++extern void ext2_xattr_destroy_cache(struct mb2_cache *cache); + + extern const struct xattr_handler *ext2_xattr_handlers[]; + +@@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode) + { + } + +-static inline void +-ext2_xattr_put_super(struct super_block *sb) +-{ +-} +- +-static inline int +-init_ext2_xattr(void) +-{ +- return 0; +-} +- +-static inline void +-exit_ext2_xattr(void) ++static inline void ext2_xattr_destroy_cache(struct mb2_cache *cache) + { + } + +-- +2.1.4 + + diff --git a/ext4-convert-to-mbcache2 b/ext4-convert-to-mbcache2 new file mode 100644 index 00000000..88b8385f --- /dev/null +++ b/ext4-convert-to-mbcache2 @@ -0,0 +1,389 @@ +ext4: convert to mbcache2 + +From: Jan Kara + +The conversion is generally straightforward. The only tricky part is +that xattr block corresponding to found mbcache entry can get freed +before we get buffer lock for that block. So we have to check whether +the entry is still valid after getting buffer lock. 
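+
+For illustration only, the revalidation boils down to the pattern sketched
+below. This is a reader's sketch rather than code from this patch: the helper
+name is made up, and the refcount and content checks the real code performs
+before reusing a block are omitted. In the patch itself the check is done in
+ext4_xattr_block_set() once it holds the buffer lock on the block it is about
+to reuse.
+
+  static struct buffer_head *
+  find_reusable_xattr_block(struct super_block *sb, struct mb2_cache *cache,
+                            __u32 hash)
+  {
+          struct mb2_cache_entry *ce;
+          struct buffer_head *bh;
+
+  again:
+          ce = mb2_cache_entry_find_first(cache, hash);
+          while (ce) {
+                  bh = sb_bread(sb, ce->e_block);
+                  if (!bh) {
+                          ce = mb2_cache_entry_find_next(cache, ce);
+                          continue;
+                  }
+                  lock_buffer(bh);
+                  if (hlist_bl_unhashed(&ce->e_hash_list)) {
+                          /* Freed or rehashed while we slept; start over. */
+                          unlock_buffer(bh);
+                          brelse(bh);
+                          mb2_cache_entry_put(cache, ce);
+                          goto again;
+                  }
+                  /* Entry still hashed: the block is stable under the lock. */
+                  mb2_cache_entry_touch(cache, ce);
+                  mb2_cache_entry_put(cache, ce);
+                  return bh;              /* returned with buffer lock held */
+          }
+          return NULL;
+  }
+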
+ +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 2 +- + fs/ext4/super.c | 7 ++- + fs/ext4/xattr.c | 135 +++++++++++++++++++++++++++++--------------------------- + fs/ext4/xattr.h | 5 +-- + 4 files changed, 77 insertions(+), 72 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 750063f7a50c..068a3eaa41ac 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1371,7 +1371,7 @@ struct ext4_sb_info { + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; +- struct mb_cache *s_mb_cache; ++ struct mb2_cache *s_mb_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index c9ab67da6e5a..dbd5b9b9c99a 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -814,7 +814,6 @@ static void ext4_put_super(struct super_block *sb) + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); +- ext4_xattr_put_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + ext4_clear_feature_journal_needs_recovery(sb); +@@ -3759,7 +3758,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + + no_journal: + if (ext4_mballoc_ready) { +- sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); ++ sbi->s_mb_cache = ext4_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; +@@ -3989,6 +3988,10 @@ failed_mount4: + if (EXT4_SB(sb)->rsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); + failed_mount_wq: ++ if (sbi->s_mb_cache) { ++ ext4_xattr_destroy_cache(sbi->s_mb_cache); ++ sbi->s_mb_cache = NULL; ++ } + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 6b6b3e751f8c..a80e5e2acadd 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -53,7 +53,7 @@ + #include + #include + #include +-#include ++#include + #include + #include "ext4_jbd2.h" + #include "ext4.h" +@@ -80,10 +80,10 @@ + # define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) + #endif + +-static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); ++static void ext4_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); + static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, +- struct mb_cache_entry **); ++ struct mb2_cache_entry **); + static void ext4_xattr_rehash(struct ext4_xattr_header *, + struct ext4_xattr_entry *); + static int ext4_xattr_list(struct dentry *dentry, char *buffer, +@@ -278,7 +278,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + struct ext4_xattr_entry *entry; + size_t size; + int error; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ++ struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); +@@ -425,7 +425,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) + struct inode *inode = d_inode(dentry); + struct buffer_head *bh = NULL; + int error; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ++ struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); +@@ -542,11 +542,8 @@ static void + ext4_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) + { +- struct mb_cache_entry *ce = NULL; + int error = 0; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + +- ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); + BUFFER_TRACE(bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, bh); + if (error) +@@ -554,9 +551,15 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, + + lock_buffer(bh); + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { ++ __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); ++ + ea_bdebug(bh, "refcount now=0; freeing"); +- if (ce) +- mb_cache_entry_free(ce); ++ /* ++ * This must happen under buffer lock for ++ * ext4_xattr_block_set() to reliably detect freed block ++ */ ++ mb2_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, ++ bh->b_blocknr); + get_bh(bh); + unlock_buffer(bh); + ext4_free_blocks(handle, inode, bh, 0, 1, +@@ -564,8 +567,6 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, + EXT4_FREE_BLOCKS_FORGET); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); +- if (ce) +- mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect +@@ -778,17 +779,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext4_xattr_search *s = &bs->s; +- struct mb_cache_entry *ce = NULL; ++ struct mb2_cache_entry *ce = NULL; + int error = 0; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ++ struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + #define header(x) ((struct ext4_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { +- ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, +- bs->bh->b_blocknr); + BUFFER_TRACE(bs->bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, bs->bh); + if (error) +@@ -796,10 +795,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { +- 
if (ce) { +- mb_cache_entry_free(ce); +- ce = NULL; +- } ++ __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); ++ ++ /* ++ * This must happen under buffer lock for ++ * ext4_xattr_block_set() to reliably detect modified ++ * block ++ */ ++ mb2_cache_entry_delete_block(ext4_mb_cache, hash, ++ bs->bh->b_blocknr); + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s); + if (!error) { +@@ -823,10 +827,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); +- if (ce) { +- mb_cache_entry_release(ce); +- ce = NULL; +- } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; +@@ -881,6 +881,31 @@ inserted: + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); ++ /* ++ * We have to be careful about races with ++ * freeing or rehashing of xattr block. Once we ++ * hold buffer lock xattr block's state is ++ * stable so we can check whether the block got ++ * freed / rehashed or not. Since we unhash ++ * mbcache entry under buffer lock when freeing ++ * / rehashing xattr block, checking whether ++ * entry is still hashed is reliable. ++ */ ++ if (hlist_bl_unhashed(&ce->e_hash_list)) { ++ /* ++ * Undo everything and check mbcache ++ * again. ++ */ ++ unlock_buffer(new_bh); ++ dquot_free_block(inode, ++ EXT4_C2B(EXT4_SB(sb), ++ 1)); ++ brelse(new_bh); ++ mb2_cache_entry_put(ext4_mb_cache, ce); ++ ce = NULL; ++ new_bh = NULL; ++ goto inserted; ++ } + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); +@@ -891,7 +916,8 @@ inserted: + if (error) + goto cleanup_dquot; + } +- mb_cache_entry_release(ce); ++ mb2_cache_entry_touch(ext4_mb_cache, ce); ++ mb2_cache_entry_put(ext4_mb_cache, ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ +@@ -956,7 +982,7 @@ getblk_failed: + + cleanup: + if (ce) +- mb_cache_entry_release(ce); ++ mb2_cache_entry_put(ext4_mb_cache, ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); +@@ -1509,17 +1535,6 @@ cleanup: + } + + /* +- * ext4_xattr_put_super() +- * +- * This is called when a file system is unmounted. +- */ +-void +-ext4_xattr_put_super(struct super_block *sb) +-{ +- mb_cache_shrink(sb->s_bdev); +-} +- +-/* + * ext4_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert +@@ -1528,27 +1543,22 @@ ext4_xattr_put_super(struct super_block *sb) + * Returns 0, or a negative error number on failure. 
+ */ + static void +-ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) ++ext4_xattr_cache_insert(struct mb2_cache *ext4_mb_cache, struct buffer_head *bh) + { + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); +- struct mb_cache_entry *ce; ++ struct mb2_cache_entry *ce; + int error; + +- ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); +- if (!ce) { +- ea_bdebug(bh, "out of memory"); +- return; +- } +- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); +- if (error) { +- mb_cache_entry_free(ce); +- if (error == -EBUSY) { ++ ce = mb2_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, ++ bh->b_blocknr); ++ if (IS_ERR(ce)) { ++ if (PTR_ERR(ce) == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); +- mb_cache_entry_release(ce); ++ mb2_cache_entry_put(ext4_mb_cache, ce); + } + } + +@@ -1602,26 +1612,19 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, + */ + static struct buffer_head * + ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, +- struct mb_cache_entry **pce) ++ struct mb2_cache_entry **pce) + { + __u32 hash = le32_to_cpu(header->h_hash); +- struct mb_cache_entry *ce; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ++ struct mb2_cache_entry *ce; ++ struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +-again: +- ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, +- hash); ++ ce = mb2_cache_entry_find_first(ext4_mb_cache, hash); + while (ce) { + struct buffer_head *bh; + +- if (IS_ERR(ce)) { +- if (PTR_ERR(ce) == -EAGAIN) +- goto again; +- break; +- } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %lu read error", +@@ -1637,7 +1640,7 @@ again: + return bh; + } + brelse(bh); +- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); ++ ce = mb2_cache_entry_find_next(ext4_mb_cache, ce); + } + return NULL; + } +@@ -1712,15 +1715,15 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, + + #define HASH_BUCKET_BITS 10 + +-struct mb_cache * +-ext4_xattr_create_cache(char *name) ++struct mb2_cache * ++ext4_xattr_create_cache(void) + { +- return mb_cache_create(name, HASH_BUCKET_BITS); ++ return mb2_cache_create(HASH_BUCKET_BITS); + } + +-void ext4_xattr_destroy_cache(struct mb_cache *cache) ++void ext4_xattr_destroy_cache(struct mb2_cache *cache) + { + if (cache) +- mb_cache_destroy(cache); ++ mb2_cache_destroy(cache); + } + +diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h +index ddc0957760ba..10b0f7323ed6 100644 +--- a/fs/ext4/xattr.h ++++ b/fs/ext4/xattr.h +@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_ + extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + + extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +-extern void ext4_xattr_put_super(struct super_block *); + + extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); +@@ -124,8 +123,8 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); + +-extern struct mb_cache *ext4_xattr_create_cache(char *name); +-extern void ext4_xattr_destroy_cache(struct mb_cache *); ++extern struct mb2_cache 
*ext4_xattr_create_cache(void); ++extern void ext4_xattr_destroy_cache(struct mb2_cache *); + + #ifdef CONFIG_EXT4_FS_SECURITY + extern int ext4_init_security(handle_t *handle, struct inode *inode, +-- +2.1.4 + + diff --git a/mbcache2-limit-cache-size b/mbcache2-limit-cache-size new file mode 100644 index 00000000..83c6f568 --- /dev/null +++ b/mbcache2-limit-cache-size @@ -0,0 +1,139 @@ +mbcache2: limit cache size + +From: Jan Kara + +So far number of entries in mbcache is limited only by the pressure from +the shrinker. Since too many entries degrade the hash table and +generally we expect that caching more entries has diminishing returns, +limit number of entries the same way as in the old mbcache to 16 * hash +table size. + +Once we exceed the desired maximum number of entries, we schedule a +backround work to reclaim entries. If the background work cannot keep up +and the number of entries exceeds two times the desired maximum, we +reclaim some entries directly when allocating a new entry. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/mbcache2.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 45 insertions(+), 5 deletions(-) + +diff --git a/fs/mbcache2.c b/fs/mbcache2.c +index 4ccf0752c6d1..fe9f6f6a2953 100644 +--- a/fs/mbcache2.c ++++ b/fs/mbcache2.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -21,16 +22,29 @@ struct mb2_cache { + struct hlist_bl_head *c_hash; + /* log2 of hash table size */ + int c_bucket_bits; ++ /* Maximum entries in cache to avoid degrading hash too much */ ++ int c_max_entries; + /* Protects c_lru_list, c_entry_count */ + spinlock_t c_lru_list_lock; + struct list_head c_lru_list; + /* Number of entries in cache */ + unsigned long c_entry_count; + struct shrinker c_shrink; ++ /* Work for shrinking when the cache has too many entries */ ++ struct work_struct c_shrink_work; + }; + + static struct kmem_cache *mb2_entry_cache; + ++static unsigned long mb2_cache_shrink(struct mb2_cache *cache, ++ unsigned int nr_to_scan); ++ ++/* ++ * Number of entries to reclaim synchronously when there are too many entries ++ * in cache ++ */ ++#define SYNC_SHRINK_BATCH 64 ++ + /* + * mb2_cache_entry_create - create entry in cache + * @cache - cache where the entry should be created +@@ -52,6 +66,13 @@ struct mb2_cache_entry *mb2_cache_entry_create(struct mb2_cache *cache, + struct hlist_bl_node *dup_node; + struct hlist_bl_head *head; + ++ /* Schedule background reclaim if there are too many entries */ ++ if (cache->c_entry_count >= cache->c_max_entries) ++ schedule_work(&cache->c_shrink_work); ++ /* Do some sync reclaim if background reclaim cannot keep up */ ++ if (cache->c_entry_count >= 2*cache->c_max_entries) ++ mb2_cache_shrink(cache, SYNC_SHRINK_BATCH); ++ + entry = kmem_cache_alloc(mb2_entry_cache, mask); + if (!entry) + return ERR_PTR(-ENOMEM); +@@ -252,12 +273,9 @@ static unsigned long mb2_cache_count(struct shrinker *shrink, + } + + /* Shrink number of entries in cache */ +-static unsigned long mb2_cache_scan(struct shrinker *shrink, +- struct shrink_control *sc) ++static unsigned long mb2_cache_shrink(struct mb2_cache *cache, ++ unsigned int nr_to_scan) + { +- int nr_to_scan = sc->nr_to_scan; +- struct mb2_cache *cache = container_of(shrink, struct mb2_cache, +- c_shrink); + struct mb2_cache_entry *entry; + struct hlist_bl_head *head; + unsigned int shrunk = 0; +@@ -290,6 +308,25 @@ static unsigned long mb2_cache_scan(struct shrinker *shrink, + return shrunk; + } + ++static unsigned 
long mb2_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ int nr_to_scan = sc->nr_to_scan; ++ struct mb2_cache *cache = container_of(shrink, struct mb2_cache, ++ c_shrink); ++ return mb2_cache_shrink(cache, nr_to_scan); ++} ++ ++/* We shrink 1/X of the cache when we have too many entries in it */ ++#define SHRINK_DIVISOR 16 ++ ++static void mb2_cache_shrink_worker(struct work_struct *work) ++{ ++ struct mb2_cache *cache = container_of(work, struct mb2_cache, ++ c_shrink_work); ++ mb2_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); ++} ++ + /* + * mb2_cache_create - create cache + * @bucket_bits: log2 of the hash table size +@@ -309,6 +346,7 @@ struct mb2_cache *mb2_cache_create(int bucket_bits) + if (!cache) + goto err_out; + cache->c_bucket_bits = bucket_bits; ++ cache->c_max_entries = bucket_count << 4; + INIT_LIST_HEAD(&cache->c_lru_list); + spin_lock_init(&cache->c_lru_list_lock); + cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), +@@ -325,6 +363,8 @@ struct mb2_cache *mb2_cache_create(int bucket_bits) + cache->c_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&cache->c_shrink); + ++ INIT_WORK(&cache->c_shrink_work, mb2_cache_shrink_worker); ++ + return cache; + + err_out: +-- +2.1.4 + + diff --git a/mbcache2-use-referenced-bit-instead-of-LRU b/mbcache2-use-referenced-bit-instead-of-LRU new file mode 100644 index 00000000..1711ffdc --- /dev/null +++ b/mbcache2-use-referenced-bit-instead-of-LRU @@ -0,0 +1,139 @@ +mbcache2: use referenced bit instead of LRU + +From: Jan Kara + +Currently we maintain perfect LRU list by moving entry to the tail of +the list when it gets used. However these operations on cache-global +list are relatively expensive. + +In this patch we switch to lazy updates of LRU list. Whenever entry gets +used, we set a referenced bit in it. When reclaiming entries, we give +referenced entries another round in the LRU. + +In my testing this logic gives about 30% boost to workloads with mostly +unique xattr blocks (e.g. xattr-bench with 10 files and 10000 unique +xattr values). 
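+
+To make the scheme concrete, a minimal sketch of the second-chance logic is
+shown below. The helper names are illustrative and are not the ones added by
+this patch; as in the patch, the referenced bit lives in bit 0 of the entry's
+hash-head pointer so the entry does not grow, and the eviction decision is
+made by the shrinker under c_lru_list_lock inside fs/mbcache2.c.
+
+  /* Hot path: mark the entry used without touching the global LRU list. */
+  static inline void mb2_sketch_touch(struct mb2_cache_entry *entry)
+  {
+          entry->_e_hash_list_head |= 1;
+  }
+
+  /* Shrinker path, called with cache->c_lru_list_lock held. */
+  static bool mb2_sketch_should_evict(struct mb2_cache *cache,
+                                      struct mb2_cache_entry *entry)
+  {
+          if (!(entry->_e_hash_list_head & 1))
+                  return true;            /* cold: reclaim it now */
+          /* Referenced: clear the bit, requeue, spare it this round. */
+          entry->_e_hash_list_head &= ~1;
+          list_move_tail(&entry->e_lru_list, &cache->c_lru_list);
+          return false;
+  }
+
+The hot path thus touches only the entry itself; the cache-wide LRU lock is
+taken only when the shrinker actually scans the list.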
+ +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/mbcache2.c | 41 +++++++++++++++++++++++++++++++++-------- + include/linux/mbcache2.h | 7 +++++-- + 2 files changed, 38 insertions(+), 10 deletions(-) + +diff --git a/fs/mbcache2.c b/fs/mbcache2.c +index fe9f6f6a2953..60310a690f8d 100644 +--- a/fs/mbcache2.c ++++ b/fs/mbcache2.c +@@ -39,6 +39,29 @@ static struct kmem_cache *mb2_entry_cache; + static unsigned long mb2_cache_shrink(struct mb2_cache *cache, + unsigned int nr_to_scan); + ++static inline bool mb2_cache_entry_referenced(struct mb2_cache_entry *entry) ++{ ++ return entry->_e_hash_list_head & 1; ++} ++ ++static inline void mb2_cache_entry_set_referenced(struct mb2_cache_entry *entry) ++{ ++ entry->_e_hash_list_head |= 1; ++} ++ ++static inline void mb2_cache_entry_clear_referenced( ++ struct mb2_cache_entry *entry) ++{ ++ entry->_e_hash_list_head &= ~1; ++} ++ ++static inline struct hlist_bl_head *mb2_cache_entry_head( ++ struct mb2_cache_entry *entry) ++{ ++ return (struct hlist_bl_head *) ++ (entry->_e_hash_list_head & ~1); ++} ++ + /* + * Number of entries to reclaim synchronously when there are too many entries + * in cache +@@ -83,7 +106,7 @@ struct mb2_cache_entry *mb2_cache_entry_create(struct mb2_cache *cache, + entry->e_key = key; + entry->e_block = block; + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; +- entry->e_hash_list_head = head; ++ entry->_e_hash_list_head = (unsigned long)head; + hlist_bl_lock(head); + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { + if (dup->e_key == key && dup->e_block == block) { +@@ -125,7 +148,7 @@ EXPORT_SYMBOL(__mb2_cache_entry_free); + void mb2_cache_entry_delete(struct mb2_cache *cache, + struct mb2_cache_entry *entry) + { +- struct hlist_bl_head *head = entry->e_hash_list_head; ++ struct hlist_bl_head *head = mb2_cache_entry_head(entry); + + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { +@@ -153,7 +176,7 @@ static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, + struct hlist_bl_head *head; + + if (entry) +- head = entry->e_hash_list_head; ++ head = mb2_cache_entry_head(entry); + else + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); +@@ -256,10 +279,7 @@ EXPORT_SYMBOL(mb2_cache_entry_delete_block); + void mb2_cache_entry_touch(struct mb2_cache *cache, + struct mb2_cache_entry *entry) + { +- spin_lock(&cache->c_lru_list_lock); +- if (!list_empty(&entry->e_lru_list)) +- list_move_tail(&cache->c_lru_list, &entry->e_lru_list); +- spin_unlock(&cache->c_lru_list_lock); ++ mb2_cache_entry_set_referenced(entry); + } + EXPORT_SYMBOL(mb2_cache_entry_touch); + +@@ -284,6 +304,11 @@ static unsigned long mb2_cache_shrink(struct mb2_cache *cache, + while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) { + entry = list_first_entry(&cache->c_lru_list, + struct mb2_cache_entry, e_lru_list); ++ if (mb2_cache_entry_referenced(entry)) { ++ mb2_cache_entry_clear_referenced(entry); ++ list_move_tail(&cache->c_lru_list, &entry->e_lru_list); ++ continue; ++ } + list_del_init(&entry->e_lru_list); + cache->c_entry_count--; + /* +@@ -291,7 +316,7 @@ static unsigned long mb2_cache_shrink(struct mb2_cache *cache, + * from under us. 
+ */ + spin_unlock(&cache->c_lru_list_lock); +- head = entry->e_hash_list_head; ++ head = mb2_cache_entry_head(entry); + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); +diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h +index 2a58c51c3a0a..ca5b509c14a8 100644 +--- a/include/linux/mbcache2.h ++++ b/include/linux/mbcache2.h +@@ -19,8 +19,11 @@ struct mb2_cache_entry { + unsigned int e_key; + /* Block number of hashed block - stable during lifetime of the entry */ + sector_t e_block; +- /* Head of hash list (for list bit lock) - stable */ +- struct hlist_bl_head *e_hash_list_head; ++ /* ++ * Head of hash list (for list bit lock) - stable. Combined with ++ * referenced bit of entry ++ */ ++ unsigned long _e_hash_list_head; + }; + + struct mb2_cache *mb2_cache_create(int bucket_bits); +-- +2.1.4 + + diff --git a/reimplement-mbcache b/reimplement-mbcache new file mode 100644 index 00000000..145ab695 --- /dev/null +++ b/reimplement-mbcache @@ -0,0 +1,541 @@ +mbcache2: reimplement mbcache + +From: Jan Kara + +Original mbcache was designed to have more features than what ext? +filesystems ended up using. It supported entry being in more hashes, it +had a home-grown rwlocking of each entry, and one cache could cache +entries from multiple filesystems. This genericity also resulted in more +complex locking, larger cache entries, and generally more code +complexity. + +This is reimplementation of the mbcache functionality to exactly fit the +purpose ext? filesystems use it for. Cache entries are now considerably +smaller (7 instead of 13 longs), the code is considerably smaller as +well (432 vs 913 lines of code), and IMO also simpler. The new code is +also much more lightweight. + +I have measured the speed using artificial xattr-bench benchmark, which +spawns P processes, each process sets xattr for F different files, and +the value of xattr is randomly chosen from a pool of V values. Averages +of runtimes for 5 runs for various combinations of parameters are below. +The first value in each cell is old mbache, the second value is the new +mbcache. 
+ +V=10 +F\P 1 2 4 8 16 32 64 +10 0.158,0.157 0.208,0.196 0.500,0.277 0.798,0.400 3.258,0.584 13.807,1.047 61.339,2.803 +100 0.172,0.167 0.279,0.222 0.520,0.275 0.825,0.341 2.981,0.505 12.022,1.202 44.641,2.943 +1000 0.185,0.174 0.297,0.239 0.445,0.283 0.767,0.340 2.329,0.480 6.342,1.198 16.440,3.888 + +V=100 +F\P 1 2 4 8 16 32 64 +10 0.162,0.153 0.200,0.186 0.362,0.257 0.671,0.496 1.433,0.943 3.801,1.345 7.938,2.501 +100 0.153,0.160 0.221,0.199 0.404,0.264 0.945,0.379 1.556,0.485 3.761,1.156 7.901,2.484 +1000 0.215,0.191 0.303,0.246 0.471,0.288 0.960,0.347 1.647,0.479 3.916,1.176 8.058,3.160 + +V=1000 +F\P 1 2 4 8 16 32 64 +10 0.151,0.129 0.210,0.163 0.326,0.245 0.685,0.521 1.284,0.859 3.087,2.251 6.451,4.801 +100 0.154,0.153 0.211,0.191 0.276,0.282 0.687,0.506 1.202,0.877 3.259,1.954 8.738,2.887 +1000 0.145,0.179 0.202,0.222 0.449,0.319 0.899,0.333 1.577,0.524 4.221,1.240 9.782,3.579 + +V=10000 +F\P 1 2 4 8 16 32 64 +10 0.161,0.154 0.198,0.190 0.296,0.256 0.662,0.480 1.192,0.818 2.989,2.200 6.362,4.746 +100 0.176,0.174 0.236,0.203 0.326,0.255 0.696,0.511 1.183,0.855 4.205,3.444 19.510,17.760 +1000 0.199,0.183 0.240,0.227 1.159,1.014 2.286,2.154 6.023,6.039 10.933,--- 36.620,--- + +V=100000 +F\P 1 2 4 8 16 32 64 +10 0.171,0.162 0.204,0.198 0.285,0.230 0.692,0.500 1.225,0.881 2.990,2.243 6.379,4.771 +100 0.151,0.171 0.220,0.210 0.295,0.255 0.720,0.518 1.226,0.844 3.423,2.831 19.234,17.544 +1000 0.192,0.189 0.249,0.225 1.162,1.043 2.257,2.093 5.853,4.997 10.399,--- 32.198,--- + +We see that the new code is faster in pretty much all the cases and +starting from 4 processes there are significant gains with the new code +resulting in upto 20-times shorter runtimes. Also for large numbers of +cached entries all values for the old code could not be measured as the +kernel started hitting softlockups and died before the test completed. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/Makefile | 2 +- + fs/mbcache2.c | 388 +++++++++++++++++++++++++++++++++++++++++++++++ + include/linux/mbcache2.h | 54 +++++++ + 3 files changed, 443 insertions(+), 1 deletion(-) + create mode 100644 fs/mbcache2.c + create mode 100644 include/linux/mbcache2.h + +diff --git a/fs/Makefile b/fs/Makefile +index 79f522575cba..15b3d6c4e46a 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o + obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o + obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o + +-obj-$(CONFIG_FS_MBCACHE) += mbcache.o ++obj-$(CONFIG_FS_MBCACHE) += mbcache.o mbcache2.o + obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o + obj-$(CONFIG_NFS_COMMON) += nfs_common/ + obj-$(CONFIG_COREDUMP) += coredump.o +diff --git a/fs/mbcache2.c b/fs/mbcache2.c +new file mode 100644 +index 000000000000..4ccf0752c6d1 +--- /dev/null ++++ b/fs/mbcache2.c +@@ -0,0 +1,388 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Mbcache is a simple key-value store. Keys need not be unique, however ++ * key-value pairs are expected to be unique (we use this in ++ * mb2_cache_entry_delete_block()). ++ * ++ * We provide functions for creation and removal of entries, search by key, ++ * and a special "delete entry with given key-value pair" operation. Fixed ++ * size hash table is used for fast key lookups. 
++ */ ++ ++struct mb2_cache { ++ /* Hash table of entries */ ++ struct hlist_bl_head *c_hash; ++ /* log2 of hash table size */ ++ int c_bucket_bits; ++ /* Protects c_lru_list, c_entry_count */ ++ spinlock_t c_lru_list_lock; ++ struct list_head c_lru_list; ++ /* Number of entries in cache */ ++ unsigned long c_entry_count; ++ struct shrinker c_shrink; ++}; ++ ++static struct kmem_cache *mb2_entry_cache; ++ ++/* ++ * mb2_cache_entry_create - create entry in cache ++ * @cache - cache where the entry should be created ++ * @mask - gfp mask with which the entry should be allocated ++ * @key - key of the entry ++ * @block - block that contains data ++ * ++ * Creates entry in @cache with key @key and records that data is stored in ++ * block @block. The function returns -EBUSY if entry with the same key ++ * and for the same block already exists in cache. Otherwise reference to ++ * the created entry is returned. ++ */ ++struct mb2_cache_entry *mb2_cache_entry_create(struct mb2_cache *cache, ++ gfp_t mask, ++ unsigned int key, ++ sector_t block) ++{ ++ struct mb2_cache_entry *entry, *dup; ++ struct hlist_bl_node *dup_node; ++ struct hlist_bl_head *head; ++ ++ entry = kmem_cache_alloc(mb2_entry_cache, mask); ++ if (!entry) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&entry->e_lru_list); ++ /* One ref for hash, one ref returned */ ++ atomic_set(&entry->e_refcnt, 2); ++ entry->e_key = key; ++ entry->e_block = block; ++ head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; ++ entry->e_hash_list_head = head; ++ hlist_bl_lock(head); ++ hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { ++ if (dup->e_key == key && dup->e_block == block) { ++ hlist_bl_unlock(head); ++ kmem_cache_free(mb2_entry_cache, entry); ++ return ERR_PTR(-EBUSY); ++ } ++ } ++ hlist_bl_add_head(&entry->e_hash_list, head); ++ hlist_bl_unlock(head); ++ ++ spin_lock(&cache->c_lru_list_lock); ++ list_add_tail(&entry->e_lru_list, &cache->c_lru_list); ++ /* Grab ref for LRU list */ ++ atomic_inc(&entry->e_refcnt); ++ cache->c_entry_count++; ++ spin_unlock(&cache->c_lru_list_lock); ++ ++ return entry; ++} ++EXPORT_SYMBOL(mb2_cache_entry_create); ++ ++void __mb2_cache_entry_free(struct mb2_cache_entry *entry) ++{ ++ kmem_cache_free(mb2_entry_cache, entry); ++} ++EXPORT_SYMBOL(__mb2_cache_entry_free); ++ ++/* ++ * mb2_cache_entry_delete - delete entry from cache ++ * @cache - cache where the entry is ++ * @entry - entry to delete ++ * ++ * Delete entry from cache. The entry is unhashed and deleted from the lru list ++ * so it cannot be found. We also drop the reference to @entry caller gave us. ++ * However entry need not be freed if there's someone else still holding a ++ * reference to it. Freeing happens when the last reference is dropped. 
++ */ ++void mb2_cache_entry_delete(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry) ++{ ++ struct hlist_bl_head *head = entry->e_hash_list_head; ++ ++ hlist_bl_lock(head); ++ if (!hlist_bl_unhashed(&entry->e_hash_list)) { ++ hlist_bl_del_init(&entry->e_hash_list); ++ atomic_dec(&entry->e_refcnt); ++ } ++ hlist_bl_unlock(head); ++ spin_lock(&cache->c_lru_list_lock); ++ if (!list_empty(&entry->e_lru_list)) { ++ list_del_init(&entry->e_lru_list); ++ cache->c_entry_count--; ++ atomic_dec(&entry->e_refcnt); ++ } ++ spin_unlock(&cache->c_lru_list_lock); ++ mb2_cache_entry_put(cache, entry); ++} ++EXPORT_SYMBOL(mb2_cache_entry_delete); ++ ++static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry, ++ unsigned int key) ++{ ++ struct mb2_cache_entry *old_entry = entry; ++ struct hlist_bl_node *node; ++ struct hlist_bl_head *head; ++ ++ if (entry) ++ head = entry->e_hash_list_head; ++ else ++ head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; ++ hlist_bl_lock(head); ++ if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) ++ node = entry->e_hash_list.next; ++ else ++ node = hlist_bl_first(head); ++ while (node) { ++ entry = hlist_bl_entry(node, struct mb2_cache_entry, ++ e_hash_list); ++ if (entry->e_key == key) { ++ atomic_inc(&entry->e_refcnt); ++ goto out; ++ } ++ node = node->next; ++ } ++ entry = NULL; ++out: ++ hlist_bl_unlock(head); ++ if (old_entry) ++ mb2_cache_entry_put(cache, old_entry); ++ ++ return entry; ++} ++ ++/* ++ * mb2_cache_entry_find_first - find the first entry in cache with given key ++ * @cache: cache where we should search ++ * @key: key to look for ++ * ++ * Search in @cache for entry with key @key. Grabs reference to the first ++ * entry found and returns the entry. ++ */ ++struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, ++ unsigned int key) ++{ ++ return __entry_find(cache, NULL, key); ++} ++EXPORT_SYMBOL(mb2_cache_entry_find_first); ++ ++/* ++ * mb2_cache_entry_find_next - find next entry in cache with the same ++ * @cache: cache where we should search ++ * @entry: entry to start search from ++ * ++ * Finds next entry in the hash chain which has the same key as @entry. ++ * If @entry is unhashed (which can happen when deletion of entry races ++ * with the search), finds the first entry in the hash chain. The function ++ * drops reference to @entry and returns with a reference to the found entry. ++ */ ++struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry) ++{ ++ return __entry_find(cache, entry, entry->e_key); ++} ++EXPORT_SYMBOL(mb2_cache_entry_find_next); ++ ++/* mb2_cache_entry_delete_block - remove information about block from cache ++ * @cache - cache we work with ++ * @key - key of the entry to remove ++ * @block - block containing data for @key ++ * ++ * Remove entry from cache @cache with key @key with data stored in @block. 
++ */ ++void mb2_cache_entry_delete_block(struct mb2_cache *cache, unsigned int key, ++ sector_t block) ++{ ++ struct hlist_bl_node *node; ++ struct hlist_bl_head *head; ++ struct mb2_cache_entry *entry; ++ ++ head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; ++ hlist_bl_lock(head); ++ hlist_bl_for_each_entry(entry, node, head, e_hash_list) { ++ if (entry->e_key == key && entry->e_block == block) { ++ /* We keep hash list reference to keep entry alive */ ++ hlist_bl_del_init(&entry->e_hash_list); ++ hlist_bl_unlock(head); ++ spin_lock(&cache->c_lru_list_lock); ++ if (!list_empty(&entry->e_lru_list)) { ++ list_del_init(&entry->e_lru_list); ++ cache->c_entry_count--; ++ atomic_dec(&entry->e_refcnt); ++ } ++ spin_unlock(&cache->c_lru_list_lock); ++ mb2_cache_entry_put(cache, entry); ++ return; ++ } ++ } ++ hlist_bl_unlock(head); ++} ++EXPORT_SYMBOL(mb2_cache_entry_delete_block); ++ ++/* mb2_cache_entry_touch - cache entry got used ++ * @cache - cache the entry belongs to ++ * @entry - entry that got used ++ * ++ * Move entry in lru list to reflect the fact that it was used. ++ */ ++void mb2_cache_entry_touch(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry) ++{ ++ spin_lock(&cache->c_lru_list_lock); ++ if (!list_empty(&entry->e_lru_list)) ++ list_move_tail(&cache->c_lru_list, &entry->e_lru_list); ++ spin_unlock(&cache->c_lru_list_lock); ++} ++EXPORT_SYMBOL(mb2_cache_entry_touch); ++ ++static unsigned long mb2_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct mb2_cache *cache = container_of(shrink, struct mb2_cache, ++ c_shrink); ++ ++ return cache->c_entry_count; ++} ++ ++/* Shrink number of entries in cache */ ++static unsigned long mb2_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ int nr_to_scan = sc->nr_to_scan; ++ struct mb2_cache *cache = container_of(shrink, struct mb2_cache, ++ c_shrink); ++ struct mb2_cache_entry *entry; ++ struct hlist_bl_head *head; ++ unsigned int shrunk = 0; ++ ++ spin_lock(&cache->c_lru_list_lock); ++ while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) { ++ entry = list_first_entry(&cache->c_lru_list, ++ struct mb2_cache_entry, e_lru_list); ++ list_del_init(&entry->e_lru_list); ++ cache->c_entry_count--; ++ /* ++ * We keep LRU list reference so that entry doesn't go away ++ * from under us. ++ */ ++ spin_unlock(&cache->c_lru_list_lock); ++ head = entry->e_hash_list_head; ++ hlist_bl_lock(head); ++ if (!hlist_bl_unhashed(&entry->e_hash_list)) { ++ hlist_bl_del_init(&entry->e_hash_list); ++ atomic_dec(&entry->e_refcnt); ++ } ++ hlist_bl_unlock(head); ++ if (mb2_cache_entry_put(cache, entry)) ++ shrunk++; ++ cond_resched(); ++ spin_lock(&cache->c_lru_list_lock); ++ } ++ spin_unlock(&cache->c_lru_list_lock); ++ ++ return shrunk; ++} ++ ++/* ++ * mb2_cache_create - create cache ++ * @bucket_bits: log2 of the hash table size ++ * ++ * Create cache for keys with 2^bucket_bits hash entries. 
++ */ ++struct mb2_cache *mb2_cache_create(int bucket_bits) ++{ ++ struct mb2_cache *cache; ++ int bucket_count = 1 << bucket_bits; ++ int i; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return NULL; ++ ++ cache = kzalloc(sizeof(struct mb2_cache), GFP_KERNEL); ++ if (!cache) ++ goto err_out; ++ cache->c_bucket_bits = bucket_bits; ++ INIT_LIST_HEAD(&cache->c_lru_list); ++ spin_lock_init(&cache->c_lru_list_lock); ++ cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), ++ GFP_KERNEL); ++ if (!cache->c_hash) { ++ kfree(cache); ++ goto err_out; ++ } ++ for (i = 0; i < bucket_count; i++) ++ INIT_HLIST_BL_HEAD(&cache->c_hash[i]); ++ ++ cache->c_shrink.count_objects = mb2_cache_count; ++ cache->c_shrink.scan_objects = mb2_cache_scan; ++ cache->c_shrink.seeks = DEFAULT_SEEKS; ++ register_shrinker(&cache->c_shrink); ++ ++ return cache; ++ ++err_out: ++ module_put(THIS_MODULE); ++ return NULL; ++} ++EXPORT_SYMBOL(mb2_cache_create); ++ ++/* ++ * mb2_cache_destroy - destroy cache ++ * @cache: the cache to destroy ++ * ++ * Free all entries in cache and cache itself. Caller must make sure nobody ++ * (except shrinker) can reach @cache when calling this. ++ */ ++void mb2_cache_destroy(struct mb2_cache *cache) ++{ ++ struct mb2_cache_entry *entry, *next; ++ ++ unregister_shrinker(&cache->c_shrink); ++ ++ /* ++ * We don't bother with any locking. Cache must not be used at this ++ * point. ++ */ ++ list_for_each_entry_safe(entry, next, &cache->c_lru_list, e_lru_list) { ++ if (!hlist_bl_unhashed(&entry->e_hash_list)) { ++ hlist_bl_del_init(&entry->e_hash_list); ++ atomic_dec(&entry->e_refcnt); ++ } else ++ WARN_ON(1); ++ list_del(&entry->e_lru_list); ++ WARN_ON(atomic_read(&entry->e_refcnt) != 1); ++ mb2_cache_entry_put(cache, entry); ++ } ++ kfree(cache->c_hash); ++ kfree(cache); ++ module_put(THIS_MODULE); ++} ++EXPORT_SYMBOL(mb2_cache_destroy); ++ ++static int __init mb2cache_init(void) ++{ ++ mb2_entry_cache = kmem_cache_create("mbcache", ++ sizeof(struct mb2_cache_entry), 0, ++ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); ++ BUG_ON(!mb2_entry_cache); ++ return 0; ++} ++ ++static void __exit mb2cache_exit(void) ++{ ++ kmem_cache_destroy(mb2_entry_cache); ++} ++ ++module_init(mb2cache_init) ++module_exit(mb2cache_exit) ++ ++MODULE_AUTHOR("Jan Kara "); ++MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); ++MODULE_LICENSE("GPL"); +diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h +new file mode 100644 +index 000000000000..2a58c51c3a0a +--- /dev/null ++++ b/include/linux/mbcache2.h +@@ -0,0 +1,54 @@ ++#ifndef _LINUX_MB2CACHE_H ++#define _LINUX_MB2CACHE_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++struct mb2_cache; ++ ++struct mb2_cache_entry { ++ /* LRU list - protected by cache->c_lru_list_lock */ ++ struct list_head e_lru_list; ++ /* Hash table list - protected by bitlock in e_hash_list_head */ ++ struct hlist_bl_node e_hash_list; ++ atomic_t e_refcnt; ++ /* Key in hash - stable during lifetime of the entry */ ++ unsigned int e_key; ++ /* Block number of hashed block - stable during lifetime of the entry */ ++ sector_t e_block; ++ /* Head of hash list (for list bit lock) - stable */ ++ struct hlist_bl_head *e_hash_list_head; ++}; ++ ++struct mb2_cache *mb2_cache_create(int bucket_bits); ++void mb2_cache_destroy(struct mb2_cache *cache); ++ ++struct mb2_cache_entry *mb2_cache_entry_create(struct mb2_cache *cache, ++ gfp_t mask, ++ unsigned int key, ++ sector_t block); ++void mb2_cache_entry_delete(struct mb2_cache *cache, ++ struct 
mb2_cache_entry *entry); ++void __mb2_cache_entry_free(struct mb2_cache_entry *entry); ++static inline int mb2_cache_entry_put(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry) ++{ ++ if (!atomic_dec_and_test(&entry->e_refcnt)) ++ return 0; ++ __mb2_cache_entry_free(entry); ++ return 1; ++} ++ ++void mb2_cache_entry_delete_block(struct mb2_cache *cache, unsigned int key, ++ sector_t block); ++struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, ++ unsigned int key); ++struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry); ++void mb2_cache_entry_touch(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry); ++ ++#endif /* _LINUX_MB2CACHE_H */ +-- +2.1.4 + + diff --git a/remove-mbcache b/remove-mbcache new file mode 100644 index 00000000..18cbd236 --- /dev/null +++ b/remove-mbcache @@ -0,0 +1,959 @@ +mbcache: remove + +From: Jan Kara + +Both ext2 and ext4 are now converted to mbcache2. Remove the old mbcache +code. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/Makefile | 2 +- + fs/mbcache.c | 858 ------------------------------------------------ + include/linux/mbcache.h | 55 ---- + 3 files changed, 1 insertion(+), 914 deletions(-) + delete mode 100644 fs/mbcache.c + delete mode 100644 include/linux/mbcache.h + +diff --git a/fs/Makefile b/fs/Makefile +index 15b3d6c4e46a..59b844007fbc 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o + obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o + obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o + +-obj-$(CONFIG_FS_MBCACHE) += mbcache.o mbcache2.o ++obj-$(CONFIG_FS_MBCACHE) += mbcache2.o + obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o + obj-$(CONFIG_NFS_COMMON) += nfs_common/ + obj-$(CONFIG_COREDUMP) += coredump.o +diff --git a/fs/mbcache.c b/fs/mbcache.c +deleted file mode 100644 +index 187477ded6b3..000000000000 +--- a/fs/mbcache.c ++++ /dev/null +@@ -1,858 +0,0 @@ +-/* +- * linux/fs/mbcache.c +- * (C) 2001-2002 Andreas Gruenbacher, +- */ +- +-/* +- * Filesystem Meta Information Block Cache (mbcache) +- * +- * The mbcache caches blocks of block devices that need to be located +- * by their device/block number, as well as by other criteria (such +- * as the block's contents). +- * +- * There can only be one cache entry in a cache per device and block number. +- * Additional indexes need not be unique in this sense. The number of +- * additional indexes (=other criteria) can be hardwired at compile time +- * or specified at cache create time. +- * +- * Each cache entry is of fixed size. An entry may be `valid' or `invalid' +- * in the cache. A valid entry is in the main hash tables of the cache, +- * and may also be in the lru list. An invalid entry is not in any hashes +- * or lists. +- * +- * A valid cache entry is only in the lru list if no handles refer to it. +- * Invalid cache entries will be freed when the last handle to the cache +- * entry is released. Entries that cannot be freed immediately are put +- * back on the lru list. +- */ +- +-/* +- * Lock descriptions and usage: +- * +- * Each hash chain of both the block and index hash tables now contains +- * a built-in lock used to serialize accesses to the hash chain. +- * +- * Accesses to global data structures mb_cache_list and mb_cache_lru_list +- * are serialized via the global spinlock mb_cache_spinlock. 
+- * +- * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize +- * accesses to its local data, such as e_used and e_queued. +- * +- * Lock ordering: +- * +- * Each block hash chain's lock has the highest lock order, followed by an +- * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's +- * lock), and mb_cach_spinlock, with the lowest order. While holding +- * either a block or index hash chain lock, a thread can acquire an +- * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. +- * +- * Synchronization: +- * +- * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and +- * index hash chian, it needs to lock the corresponding hash chain. For each +- * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to +- * prevent either any simultaneous release or free on the entry and also +- * to serialize accesses to either the e_used or e_queued member of the entry. +- * +- * To avoid having a dangling reference to an already freed +- * mb_cache_entry, an mb_cache_entry is only freed when it is not on a +- * block hash chain and also no longer being referenced, both e_used, +- * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is +- * first removed from a block hash chain. +- */ +- +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef MB_CACHE_DEBUG +-# define mb_debug(f...) do { \ +- printk(KERN_DEBUG f); \ +- printk("\n"); \ +- } while (0) +-#define mb_assert(c) do { if (!(c)) \ +- printk(KERN_ERR "assertion " #c " failed\n"); \ +- } while(0) +-#else +-# define mb_debug(f...) do { } while(0) +-# define mb_assert(c) do { } while(0) +-#endif +-#define mb_error(f...) do { \ +- printk(KERN_ERR f); \ +- printk("\n"); \ +- } while(0) +- +-#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) +- +-#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) +-#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ +- (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) +- +-static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); +-static struct blockgroup_lock *mb_cache_bg_lock; +-static struct kmem_cache *mb_cache_kmem_cache; +- +-MODULE_AUTHOR("Andreas Gruenbacher "); +-MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); +-MODULE_LICENSE("GPL"); +- +-EXPORT_SYMBOL(mb_cache_create); +-EXPORT_SYMBOL(mb_cache_shrink); +-EXPORT_SYMBOL(mb_cache_destroy); +-EXPORT_SYMBOL(mb_cache_entry_alloc); +-EXPORT_SYMBOL(mb_cache_entry_insert); +-EXPORT_SYMBOL(mb_cache_entry_release); +-EXPORT_SYMBOL(mb_cache_entry_free); +-EXPORT_SYMBOL(mb_cache_entry_get); +-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) +-EXPORT_SYMBOL(mb_cache_entry_find_first); +-EXPORT_SYMBOL(mb_cache_entry_find_next); +-#endif +- +-/* +- * Global data: list of all mbcache's, lru list, and a spinlock for +- * accessing cache data structures on SMP machines. The lru list is +- * global across all mbcaches. 
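
[Editor's sketch] The comment above describes the old design's single LRU list and spinlock shared by every cache instance. A stripped-down illustration of that pattern, with demo_* names standing in for the real identifiers, is:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	/* One list and one lock shared by all caches, as in the code being
	 * removed. */
	static LIST_HEAD(demo_lru_list);
	static DEFINE_SPINLOCK(demo_lru_lock);

	/* Put an entry (identified by its lru list_head) back on the global
	 * LRU if it is not already there. */
	static void demo_lru_add(struct list_head *e_lru_list)
	{
		spin_lock(&demo_lru_lock);
		if (list_empty(e_lru_list))
			list_add_tail(e_lru_list, &demo_lru_list);
		spin_unlock(&demo_lru_lock);
	}
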
+- */ +- +-static LIST_HEAD(mb_cache_list); +-static LIST_HEAD(mb_cache_lru_list); +-static DEFINE_SPINLOCK(mb_cache_spinlock); +- +-static inline void +-__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) +-{ +- spin_lock(bgl_lock_ptr(mb_cache_bg_lock, +- MB_CACHE_ENTRY_LOCK_INDEX(ce))); +-} +- +-static inline void +-__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) +-{ +- spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, +- MB_CACHE_ENTRY_LOCK_INDEX(ce))); +-} +- +-static inline int +-__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) +-{ +- return !hlist_bl_unhashed(&ce->e_block_list); +-} +- +- +-static inline void +-__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) +-{ +- if (__mb_cache_entry_is_block_hashed(ce)) +- hlist_bl_del_init(&ce->e_block_list); +-} +- +-static inline int +-__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) +-{ +- return !hlist_bl_unhashed(&ce->e_index.o_list); +-} +- +-static inline void +-__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) +-{ +- if (__mb_cache_entry_is_index_hashed(ce)) +- hlist_bl_del_init(&ce->e_index.o_list); +-} +- +-/* +- * __mb_cache_entry_unhash_unlock() +- * +- * This function is called to unhash both the block and index hash +- * chain. +- * It assumes both the block and index hash chain is locked upon entry. +- * It also unlock both hash chains both exit +- */ +-static inline void +-__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) +-{ +- __mb_cache_entry_unhash_index(ce); +- hlist_bl_unlock(ce->e_index_hash_p); +- __mb_cache_entry_unhash_block(ce); +- hlist_bl_unlock(ce->e_block_hash_p); +-} +- +-static void +-__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) +-{ +- struct mb_cache *cache = ce->e_cache; +- +- mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); +- kmem_cache_free(cache->c_entry_cache, ce); +- atomic_dec(&cache->c_entry_count); +-} +- +-static void +-__mb_cache_entry_release(struct mb_cache_entry *ce) +-{ +- /* First lock the entry to serialize access to its local data. */ +- __spin_lock_mb_cache_entry(ce); +- /* Wake up all processes queuing for this cache entry. */ +- if (ce->e_queued) +- wake_up_all(&mb_cache_queue); +- if (ce->e_used >= MB_CACHE_WRITER) +- ce->e_used -= MB_CACHE_WRITER; +- /* +- * Make sure that all cache entries on lru_list have +- * both e_used and e_qued of 0s. +- */ +- ce->e_used--; +- if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { +- if (!__mb_cache_entry_is_block_hashed(ce)) { +- __spin_unlock_mb_cache_entry(ce); +- goto forget; +- } +- /* +- * Need access to lru list, first drop entry lock, +- * then reacquire the lock in the proper order. +- */ +- spin_lock(&mb_cache_spinlock); +- if (list_empty(&ce->e_lru_list)) +- list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); +- spin_unlock(&mb_cache_spinlock); +- } +- __spin_unlock_mb_cache_entry(ce); +- return; +-forget: +- mb_assert(list_empty(&ce->e_lru_list)); +- __mb_cache_entry_forget(ce, GFP_KERNEL); +-} +- +-/* +- * mb_cache_shrink_scan() memory pressure callback +- * +- * This function is called by the kernel memory management when memory +- * gets low. +- * +- * @shrink: (ignored) +- * @sc: shrink_control passed from reclaim +- * +- * Returns the number of objects freed. 
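
[Editor's sketch] Both the old code (mb_cache_shrinker below) and the new mb2 code register a shrinker with a count callback and a scan callback. The general wiring looks like the following; demo_count()/demo_scan() are placeholders for the real accounting, not code from the patch:

	#include <linux/mm.h>
	#include <linux/shrinker.h>

	static unsigned long demo_count(struct shrinker *shrink,
					struct shrink_control *sc)
	{
		return 0;	/* number of reclaimable entries */
	}

	static unsigned long demo_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
	{
		return 0;	/* entries actually freed, at most sc->nr_to_scan */
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
	};
	/* register_shrinker(&demo_shrinker) at init,
	 * unregister_shrinker(&demo_shrinker) on exit. */
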
+- */ +-static unsigned long +-mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +-{ +- LIST_HEAD(free_list); +- struct mb_cache_entry *entry, *tmp; +- int nr_to_scan = sc->nr_to_scan; +- gfp_t gfp_mask = sc->gfp_mask; +- unsigned long freed = 0; +- +- mb_debug("trying to free %d entries", nr_to_scan); +- spin_lock(&mb_cache_spinlock); +- while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { +- struct mb_cache_entry *ce = +- list_entry(mb_cache_lru_list.next, +- struct mb_cache_entry, e_lru_list); +- list_del_init(&ce->e_lru_list); +- if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) +- continue; +- spin_unlock(&mb_cache_spinlock); +- /* Prevent any find or get operation on the entry */ +- hlist_bl_lock(ce->e_block_hash_p); +- hlist_bl_lock(ce->e_index_hash_p); +- /* Ignore if it is touched by a find/get */ +- if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || +- !list_empty(&ce->e_lru_list)) { +- hlist_bl_unlock(ce->e_index_hash_p); +- hlist_bl_unlock(ce->e_block_hash_p); +- spin_lock(&mb_cache_spinlock); +- continue; +- } +- __mb_cache_entry_unhash_unlock(ce); +- list_add_tail(&ce->e_lru_list, &free_list); +- spin_lock(&mb_cache_spinlock); +- } +- spin_unlock(&mb_cache_spinlock); +- +- list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { +- __mb_cache_entry_forget(entry, gfp_mask); +- freed++; +- } +- return freed; +-} +- +-static unsigned long +-mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +-{ +- struct mb_cache *cache; +- unsigned long count = 0; +- +- spin_lock(&mb_cache_spinlock); +- list_for_each_entry(cache, &mb_cache_list, c_cache_list) { +- mb_debug("cache %s (%d)", cache->c_name, +- atomic_read(&cache->c_entry_count)); +- count += atomic_read(&cache->c_entry_count); +- } +- spin_unlock(&mb_cache_spinlock); +- +- return vfs_pressure_ratio(count); +-} +- +-static struct shrinker mb_cache_shrinker = { +- .count_objects = mb_cache_shrink_count, +- .scan_objects = mb_cache_shrink_scan, +- .seeks = DEFAULT_SEEKS, +-}; +- +-/* +- * mb_cache_create() create a new cache +- * +- * All entries in one cache are equal size. Cache entries may be from +- * multiple devices. If this is the first mbcache created, registers +- * the cache with kernel memory management. Returns NULL if no more +- * memory was available. 
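
[Editor's sketch] In both the old and the new cache, bucket_bits is log2 of the hash-table size, i.e. the table has 1 << bucket_bits hlist_bl heads. A minimal sketch of that allocation pattern (the patch itself uses a plain kmalloc() of bucket_count * sizeof(head); kmalloc_array() here is an equivalent, and demo_alloc_buckets() is not from the patch):

	#include <linux/list_bl.h>
	#include <linux/slab.h>

	static struct hlist_bl_head *demo_alloc_buckets(int bucket_bits)
	{
		int i, count = 1 << bucket_bits;
		struct hlist_bl_head *hash;

		hash = kmalloc_array(count, sizeof(*hash), GFP_KERNEL);
		if (!hash)
			return NULL;
		for (i = 0; i < count; i++)
			INIT_HLIST_BL_HEAD(&hash[i]);
		return hash;
	}
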
+- * +- * @name: name of the cache (informal) +- * @bucket_bits: log2(number of hash buckets) +- */ +-struct mb_cache * +-mb_cache_create(const char *name, int bucket_bits) +-{ +- int n, bucket_count = 1 << bucket_bits; +- struct mb_cache *cache = NULL; +- +- if (!mb_cache_bg_lock) { +- mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), +- GFP_KERNEL); +- if (!mb_cache_bg_lock) +- return NULL; +- bgl_lock_init(mb_cache_bg_lock); +- } +- +- cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); +- if (!cache) +- return NULL; +- cache->c_name = name; +- atomic_set(&cache->c_entry_count, 0); +- cache->c_bucket_bits = bucket_bits; +- cache->c_block_hash = kmalloc(bucket_count * +- sizeof(struct hlist_bl_head), GFP_KERNEL); +- if (!cache->c_block_hash) +- goto fail; +- for (n=0; nc_block_hash[n]); +- cache->c_index_hash = kmalloc(bucket_count * +- sizeof(struct hlist_bl_head), GFP_KERNEL); +- if (!cache->c_index_hash) +- goto fail; +- for (n=0; nc_index_hash[n]); +- if (!mb_cache_kmem_cache) { +- mb_cache_kmem_cache = kmem_cache_create(name, +- sizeof(struct mb_cache_entry), 0, +- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); +- if (!mb_cache_kmem_cache) +- goto fail2; +- } +- cache->c_entry_cache = mb_cache_kmem_cache; +- +- /* +- * Set an upper limit on the number of cache entries so that the hash +- * chains won't grow too long. +- */ +- cache->c_max_entries = bucket_count << 4; +- +- spin_lock(&mb_cache_spinlock); +- list_add(&cache->c_cache_list, &mb_cache_list); +- spin_unlock(&mb_cache_spinlock); +- return cache; +- +-fail2: +- kfree(cache->c_index_hash); +- +-fail: +- kfree(cache->c_block_hash); +- kfree(cache); +- return NULL; +-} +- +- +-/* +- * mb_cache_shrink() +- * +- * Removes all cache entries of a device from the cache. All cache entries +- * currently in use cannot be freed, and thus remain in the cache. All others +- * are freed. +- * +- * @bdev: which device's cache entries to shrink +- */ +-void +-mb_cache_shrink(struct block_device *bdev) +-{ +- LIST_HEAD(free_list); +- struct list_head *l; +- struct mb_cache_entry *ce, *tmp; +- +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- while (!list_is_last(l, &mb_cache_lru_list)) { +- l = l->next; +- ce = list_entry(l, struct mb_cache_entry, e_lru_list); +- if (ce->e_bdev == bdev) { +- list_del_init(&ce->e_lru_list); +- if (ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt)) +- continue; +- spin_unlock(&mb_cache_spinlock); +- /* +- * Prevent any find or get operation on the entry. +- */ +- hlist_bl_lock(ce->e_block_hash_p); +- hlist_bl_lock(ce->e_index_hash_p); +- /* Ignore if it is touched by a find/get */ +- if (ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt) || +- !list_empty(&ce->e_lru_list)) { +- hlist_bl_unlock(ce->e_index_hash_p); +- hlist_bl_unlock(ce->e_block_hash_p); +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- continue; +- } +- __mb_cache_entry_unhash_unlock(ce); +- mb_assert(!(ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt))); +- list_add_tail(&ce->e_lru_list, &free_list); +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- } +- } +- spin_unlock(&mb_cache_spinlock); +- +- list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { +- __mb_cache_entry_forget(ce, GFP_KERNEL); +- } +-} +- +- +-/* +- * mb_cache_destroy() +- * +- * Shrinks the cache to its minimum possible size (hopefully 0 entries), +- * and then destroys it. If this was the last mbcache, un-registers the +- * mbcache from kernel memory management. 
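
[Editor's sketch] mb_cache_shrink() above and mb_cache_destroy() below both use the same "collect then free" idiom: matching entries are moved to a private list while the lock is held, and only freed once the lock is dropped. A self-contained illustration of the idiom (demo_entry/demo_evict_owner are stand-ins, not patch code):

	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct demo_entry {
		struct list_head lru;
		void *owner;
	};

	static void demo_evict_owner(struct list_head *lru_list,
				     spinlock_t *lock, void *owner)
	{
		LIST_HEAD(free_list);
		struct demo_entry *e, *tmp;

		/* Move matching entries to a private list under the lock... */
		spin_lock(lock);
		list_for_each_entry_safe(e, tmp, lru_list, lru)
			if (e->owner == owner)
				list_move_tail(&e->lru, &free_list);
		spin_unlock(lock);

		/* ...then free them with the lock dropped. */
		list_for_each_entry_safe(e, tmp, &free_list, lru) {
			list_del(&e->lru);
			kfree(e);
		}
	}
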
+- */ +-void +-mb_cache_destroy(struct mb_cache *cache) +-{ +- LIST_HEAD(free_list); +- struct mb_cache_entry *ce, *tmp; +- +- spin_lock(&mb_cache_spinlock); +- list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { +- if (ce->e_cache == cache) +- list_move_tail(&ce->e_lru_list, &free_list); +- } +- list_del(&cache->c_cache_list); +- spin_unlock(&mb_cache_spinlock); +- +- list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { +- list_del_init(&ce->e_lru_list); +- /* +- * Prevent any find or get operation on the entry. +- */ +- hlist_bl_lock(ce->e_block_hash_p); +- hlist_bl_lock(ce->e_index_hash_p); +- mb_assert(!(ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt))); +- __mb_cache_entry_unhash_unlock(ce); +- __mb_cache_entry_forget(ce, GFP_KERNEL); +- } +- +- if (atomic_read(&cache->c_entry_count) > 0) { +- mb_error("cache %s: %d orphaned entries", +- cache->c_name, +- atomic_read(&cache->c_entry_count)); +- } +- +- if (list_empty(&mb_cache_list)) { +- kmem_cache_destroy(mb_cache_kmem_cache); +- mb_cache_kmem_cache = NULL; +- } +- kfree(cache->c_index_hash); +- kfree(cache->c_block_hash); +- kfree(cache); +-} +- +-/* +- * mb_cache_entry_alloc() +- * +- * Allocates a new cache entry. The new entry will not be valid initially, +- * and thus cannot be looked up yet. It should be filled with data, and +- * then inserted into the cache using mb_cache_entry_insert(). Returns NULL +- * if no more memory was available. +- */ +-struct mb_cache_entry * +-mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) +-{ +- struct mb_cache_entry *ce; +- +- if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { +- struct list_head *l; +- +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- while (!list_is_last(l, &mb_cache_lru_list)) { +- l = l->next; +- ce = list_entry(l, struct mb_cache_entry, e_lru_list); +- if (ce->e_cache == cache) { +- list_del_init(&ce->e_lru_list); +- if (ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt)) +- continue; +- spin_unlock(&mb_cache_spinlock); +- /* +- * Prevent any find or get operation on the +- * entry. +- */ +- hlist_bl_lock(ce->e_block_hash_p); +- hlist_bl_lock(ce->e_index_hash_p); +- /* Ignore if it is touched by a find/get */ +- if (ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt) || +- !list_empty(&ce->e_lru_list)) { +- hlist_bl_unlock(ce->e_index_hash_p); +- hlist_bl_unlock(ce->e_block_hash_p); +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- continue; +- } +- mb_assert(list_empty(&ce->e_lru_list)); +- mb_assert(!(ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt))); +- __mb_cache_entry_unhash_unlock(ce); +- goto found; +- } +- } +- spin_unlock(&mb_cache_spinlock); +- } +- +- ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); +- if (!ce) +- return NULL; +- atomic_inc(&cache->c_entry_count); +- INIT_LIST_HEAD(&ce->e_lru_list); +- INIT_HLIST_BL_NODE(&ce->e_block_list); +- INIT_HLIST_BL_NODE(&ce->e_index.o_list); +- ce->e_cache = cache; +- ce->e_queued = 0; +- atomic_set(&ce->e_refcnt, 0); +-found: +- ce->e_block_hash_p = &cache->c_block_hash[0]; +- ce->e_index_hash_p = &cache->c_index_hash[0]; +- ce->e_used = 1 + MB_CACHE_WRITER; +- return ce; +-} +- +- +-/* +- * mb_cache_entry_insert() +- * +- * Inserts an entry that was allocated using mb_cache_entry_alloc() into +- * the cache. After this, the cache entry can be looked up, but is not yet +- * in the lru list as the caller still holds a handle to it. 
Returns 0 on +- * success, or -EBUSY if a cache entry for that device + inode exists +- * already (this may happen after a failed lookup, but when another process +- * has inserted the same cache entry in the meantime). +- * +- * @bdev: device the cache entry belongs to +- * @block: block number +- * @key: lookup key +- */ +-int +-mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, +- sector_t block, unsigned int key) +-{ +- struct mb_cache *cache = ce->e_cache; +- unsigned int bucket; +- struct hlist_bl_node *l; +- struct hlist_bl_head *block_hash_p; +- struct hlist_bl_head *index_hash_p; +- struct mb_cache_entry *lce; +- +- mb_assert(ce); +- bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), +- cache->c_bucket_bits); +- block_hash_p = &cache->c_block_hash[bucket]; +- hlist_bl_lock(block_hash_p); +- hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { +- if (lce->e_bdev == bdev && lce->e_block == block) { +- hlist_bl_unlock(block_hash_p); +- return -EBUSY; +- } +- } +- mb_assert(!__mb_cache_entry_is_block_hashed(ce)); +- __mb_cache_entry_unhash_block(ce); +- __mb_cache_entry_unhash_index(ce); +- ce->e_bdev = bdev; +- ce->e_block = block; +- ce->e_block_hash_p = block_hash_p; +- ce->e_index.o_key = key; +- hlist_bl_add_head(&ce->e_block_list, block_hash_p); +- hlist_bl_unlock(block_hash_p); +- bucket = hash_long(key, cache->c_bucket_bits); +- index_hash_p = &cache->c_index_hash[bucket]; +- hlist_bl_lock(index_hash_p); +- ce->e_index_hash_p = index_hash_p; +- hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); +- hlist_bl_unlock(index_hash_p); +- return 0; +-} +- +- +-/* +- * mb_cache_entry_release() +- * +- * Release a handle to a cache entry. When the last handle to a cache entry +- * is released it is either freed (if it is invalid) or otherwise inserted +- * in to the lru list. +- */ +-void +-mb_cache_entry_release(struct mb_cache_entry *ce) +-{ +- __mb_cache_entry_release(ce); +-} +- +- +-/* +- * mb_cache_entry_free() +- * +- */ +-void +-mb_cache_entry_free(struct mb_cache_entry *ce) +-{ +- mb_assert(ce); +- mb_assert(list_empty(&ce->e_lru_list)); +- hlist_bl_lock(ce->e_index_hash_p); +- __mb_cache_entry_unhash_index(ce); +- hlist_bl_unlock(ce->e_index_hash_p); +- hlist_bl_lock(ce->e_block_hash_p); +- __mb_cache_entry_unhash_block(ce); +- hlist_bl_unlock(ce->e_block_hash_p); +- __mb_cache_entry_release(ce); +-} +- +- +-/* +- * mb_cache_entry_get() +- * +- * Get a cache entry by device / block number. (There can only be one entry +- * in the cache per device and block.) Returns NULL if no such cache entry +- * exists. The returned cache entry is locked for exclusive access ("single +- * writer"). +- */ +-struct mb_cache_entry * +-mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, +- sector_t block) +-{ +- unsigned int bucket; +- struct hlist_bl_node *l; +- struct mb_cache_entry *ce; +- struct hlist_bl_head *block_hash_p; +- +- bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), +- cache->c_bucket_bits); +- block_hash_p = &cache->c_block_hash[bucket]; +- /* First serialize access to the block corresponding hash chain. */ +- hlist_bl_lock(block_hash_p); +- hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { +- mb_assert(ce->e_block_hash_p == block_hash_p); +- if (ce->e_bdev == bdev && ce->e_block == block) { +- /* +- * Prevent a free from removing the entry. 
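
[Editor's sketch] The refcount bump here is the standard "take a reference while the locked hash chain still pins the object" pattern; the new mb2_cache_entry_put() in mbcache2.h implements the release side with atomic_dec_and_test(). A generic, hypothetical version of the pair (demo_* names are not from the patch):

	#include <linux/atomic.h>
	#include <linux/slab.h>

	struct demo_obj {
		atomic_t refcnt;	/* starts at 1 for the creator */
	};

	/* Take a reference; the caller must already hold something (such as
	 * a locked hash chain) that keeps the object alive. */
	static void demo_get(struct demo_obj *obj)
	{
		atomic_inc(&obj->refcnt);
	}

	/* Drop a reference; the last put frees the object. */
	static void demo_put(struct demo_obj *obj)
	{
		if (atomic_dec_and_test(&obj->refcnt))
			kfree(obj);
	}
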
+- */ +- atomic_inc(&ce->e_refcnt); +- hlist_bl_unlock(block_hash_p); +- __spin_lock_mb_cache_entry(ce); +- atomic_dec(&ce->e_refcnt); +- if (ce->e_used > 0) { +- DEFINE_WAIT(wait); +- while (ce->e_used > 0) { +- ce->e_queued++; +- prepare_to_wait(&mb_cache_queue, &wait, +- TASK_UNINTERRUPTIBLE); +- __spin_unlock_mb_cache_entry(ce); +- schedule(); +- __spin_lock_mb_cache_entry(ce); +- ce->e_queued--; +- } +- finish_wait(&mb_cache_queue, &wait); +- } +- ce->e_used += 1 + MB_CACHE_WRITER; +- __spin_unlock_mb_cache_entry(ce); +- +- if (!list_empty(&ce->e_lru_list)) { +- spin_lock(&mb_cache_spinlock); +- list_del_init(&ce->e_lru_list); +- spin_unlock(&mb_cache_spinlock); +- } +- if (!__mb_cache_entry_is_block_hashed(ce)) { +- __mb_cache_entry_release(ce); +- return NULL; +- } +- return ce; +- } +- } +- hlist_bl_unlock(block_hash_p); +- return NULL; +-} +- +-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) +- +-static struct mb_cache_entry * +-__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, +- struct block_device *bdev, unsigned int key) +-{ +- +- /* The index hash chain is alredy acquire by caller. */ +- while (l != NULL) { +- struct mb_cache_entry *ce = +- hlist_bl_entry(l, struct mb_cache_entry, +- e_index.o_list); +- mb_assert(ce->e_index_hash_p == head); +- if (ce->e_bdev == bdev && ce->e_index.o_key == key) { +- /* +- * Prevent a free from removing the entry. +- */ +- atomic_inc(&ce->e_refcnt); +- hlist_bl_unlock(head); +- __spin_lock_mb_cache_entry(ce); +- atomic_dec(&ce->e_refcnt); +- ce->e_used++; +- /* Incrementing before holding the lock gives readers +- priority over writers. */ +- if (ce->e_used >= MB_CACHE_WRITER) { +- DEFINE_WAIT(wait); +- +- while (ce->e_used >= MB_CACHE_WRITER) { +- ce->e_queued++; +- prepare_to_wait(&mb_cache_queue, &wait, +- TASK_UNINTERRUPTIBLE); +- __spin_unlock_mb_cache_entry(ce); +- schedule(); +- __spin_lock_mb_cache_entry(ce); +- ce->e_queued--; +- } +- finish_wait(&mb_cache_queue, &wait); +- } +- __spin_unlock_mb_cache_entry(ce); +- if (!list_empty(&ce->e_lru_list)) { +- spin_lock(&mb_cache_spinlock); +- list_del_init(&ce->e_lru_list); +- spin_unlock(&mb_cache_spinlock); +- } +- if (!__mb_cache_entry_is_block_hashed(ce)) { +- __mb_cache_entry_release(ce); +- return ERR_PTR(-EAGAIN); +- } +- return ce; +- } +- l = l->next; +- } +- hlist_bl_unlock(head); +- return NULL; +-} +- +- +-/* +- * mb_cache_entry_find_first() +- * +- * Find the first cache entry on a given device with a certain key in +- * an additional index. Additional matches can be found with +- * mb_cache_entry_find_next(). Returns NULL if no match was found. The +- * returned cache entry is locked for shared access ("multiple readers"). 
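
[Editor's sketch] The lookup side is the usual find_first/find_next loop. With the new interface from mbcache2.h it could look like the following; demo_lookup() and its match callback are hypothetical, and the reference semantics (find_next releasing the previous entry, the caller putting the entry it stops at) are assumed to mirror the behaviour the old find_next documents below:

	#include <linux/types.h>
	#include <linux/mbcache2.h>

	static sector_t demo_lookup(struct mb2_cache *cache, unsigned int key,
				    bool (*match)(sector_t block))
	{
		struct mb2_cache_entry *entry;

		entry = mb2_cache_entry_find_first(cache, key);
		while (entry) {
			if (match(entry->e_block)) {
				sector_t block = entry->e_block;

				mb2_cache_entry_touch(cache, entry);
				mb2_cache_entry_put(cache, entry);
				return block;
			}
			entry = mb2_cache_entry_find_next(cache, entry);
		}
		return 0;
	}
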
+- * +- * @cache: the cache to search +- * @bdev: the device the cache entry should belong to +- * @key: the key in the index +- */ +-struct mb_cache_entry * +-mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, +- unsigned int key) +-{ +- unsigned int bucket = hash_long(key, cache->c_bucket_bits); +- struct hlist_bl_node *l; +- struct mb_cache_entry *ce = NULL; +- struct hlist_bl_head *index_hash_p; +- +- index_hash_p = &cache->c_index_hash[bucket]; +- hlist_bl_lock(index_hash_p); +- if (!hlist_bl_empty(index_hash_p)) { +- l = hlist_bl_first(index_hash_p); +- ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); +- } else +- hlist_bl_unlock(index_hash_p); +- return ce; +-} +- +- +-/* +- * mb_cache_entry_find_next() +- * +- * Find the next cache entry on a given device with a certain key in an +- * additional index. Returns NULL if no match could be found. The previous +- * entry is atomatically released, so that mb_cache_entry_find_next() can +- * be called like this: +- * +- * entry = mb_cache_entry_find_first(); +- * while (entry) { +- * ... +- * entry = mb_cache_entry_find_next(entry, ...); +- * } +- * +- * @prev: The previous match +- * @bdev: the device the cache entry should belong to +- * @key: the key in the index +- */ +-struct mb_cache_entry * +-mb_cache_entry_find_next(struct mb_cache_entry *prev, +- struct block_device *bdev, unsigned int key) +-{ +- struct mb_cache *cache = prev->e_cache; +- unsigned int bucket = hash_long(key, cache->c_bucket_bits); +- struct hlist_bl_node *l; +- struct mb_cache_entry *ce; +- struct hlist_bl_head *index_hash_p; +- +- index_hash_p = &cache->c_index_hash[bucket]; +- mb_assert(prev->e_index_hash_p == index_hash_p); +- hlist_bl_lock(index_hash_p); +- mb_assert(!hlist_bl_empty(index_hash_p)); +- l = prev->e_index.o_list.next; +- ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); +- __mb_cache_entry_release(prev); +- return ce; +-} +- +-#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ +- +-static int __init init_mbcache(void) +-{ +- register_shrinker(&mb_cache_shrinker); +- return 0; +-} +- +-static void __exit exit_mbcache(void) +-{ +- unregister_shrinker(&mb_cache_shrinker); +-} +- +-module_init(init_mbcache) +-module_exit(exit_mbcache) +- +diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h +deleted file mode 100644 +index 6a392e7a723a..000000000000 +--- a/include/linux/mbcache.h ++++ /dev/null +@@ -1,55 +0,0 @@ +-/* +- File: linux/mbcache.h +- +- (C) 2001 by Andreas Gruenbacher, +-*/ +-struct mb_cache_entry { +- struct list_head e_lru_list; +- struct mb_cache *e_cache; +- unsigned short e_used; +- unsigned short e_queued; +- atomic_t e_refcnt; +- struct block_device *e_bdev; +- sector_t e_block; +- struct hlist_bl_node e_block_list; +- struct { +- struct hlist_bl_node o_list; +- unsigned int o_key; +- } e_index; +- struct hlist_bl_head *e_block_hash_p; +- struct hlist_bl_head *e_index_hash_p; +-}; +- +-struct mb_cache { +- struct list_head c_cache_list; +- const char *c_name; +- atomic_t c_entry_count; +- int c_max_entries; +- int c_bucket_bits; +- struct kmem_cache *c_entry_cache; +- struct hlist_bl_head *c_block_hash; +- struct hlist_bl_head *c_index_hash; +-}; +- +-/* Functions on caches */ +- +-struct mb_cache *mb_cache_create(const char *, int); +-void mb_cache_shrink(struct block_device *); +-void mb_cache_destroy(struct mb_cache *); +- +-/* Functions on cache entries */ +- +-struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *, gfp_t); +-int 
mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *, +- sector_t, unsigned int); +-void mb_cache_entry_release(struct mb_cache_entry *); +-void mb_cache_entry_free(struct mb_cache_entry *); +-struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, +- struct block_device *, +- sector_t); +-struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, +- struct block_device *, +- unsigned int); +-struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, +- struct block_device *, +- unsigned int); +-- +2.1.4 + + diff --git a/series b/series index c17b05e6..331afd34 100644 --- a/series +++ b/series @@ -12,6 +12,15 @@ fix-memleak-in-ext4_readdir fix-bh-b_state-corruption fix-crashes-in-dioread_nolock-mode +# Above has been pushed to Linus + +reimplement-mbcache +ext4-convert-to-mbcache2 +ext2-convert-to-mbcache2 +remove-mbcache +mbcache2-limit-cache-size +mbcache2-use-referenced-bit-instead-of-LRU + ########################################## # unstable patches #################################################### diff --git a/timestamps b/timestamps index b2b84df8..da9417e4 100755 --- a/timestamps +++ b/timestamps @@ -38,7 +38,13 @@ touch -d @1455258180 remove-unused-parameter-newblock touch -d @1455600079 stable-boundary touch -d @1455859105 fix-bh-b_state-corruption touch -d @1455860001 fix-crashes-in-dioread_nolock-mode -touch -d @1455860008 status -touch -d @1455860037 series touch -d @1455908377 fix-memleak-in-ext4_readdir -touch -d @1455915067 timestamps +touch -d @1455915167 reimplement-mbcache +touch -d @1455915233 ext4-convert-to-mbcache2 +touch -d @1455915523 ext2-convert-to-mbcache2 +touch -d @1455915619 remove-mbcache +touch -d @1455915679 mbcache2-limit-cache-size +touch -d @1455915868 mbcache2-use-referenced-bit-instead-of-LRU +touch -d @1455915890 series +touch -d @1455915931 status +touch -d @1456033604 timestamps -- 2.11.4.GIT