From 86a1947e5ddd2088c81198e73af3c961a4bcec35 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 19 Feb 2016 15:53:08 -0500 Subject: [PATCH] Add Jan Kara's mbcache 2 patch set --- ext2-convert-to-mbcache2 | 512 +++++++++++++++ ext4-convert-to-mbcache2 | 389 ++++++++++++ mbcache2-limit-cache-size | 139 +++++ mbcache2-use-referenced-bit-instead-of-LRU | 139 +++++ reimplement-mbcache | 541 ++++++++++++++++ remove-mbcache | 959 +++++++++++++++++++++++++++++ series | 9 + timestamps | 12 +- 8 files changed, 2697 insertions(+), 3 deletions(-) create mode 100644 ext2-convert-to-mbcache2 create mode 100644 ext4-convert-to-mbcache2 create mode 100644 mbcache2-limit-cache-size create mode 100644 mbcache2-use-referenced-bit-instead-of-LRU create mode 100644 reimplement-mbcache create mode 100644 remove-mbcache diff --git a/ext2-convert-to-mbcache2 b/ext2-convert-to-mbcache2 new file mode 100644 index 00000000..00bc485d --- /dev/null +++ b/ext2-convert-to-mbcache2 @@ -0,0 +1,512 @@ +ext2: convert to mbcache2 + +From: Jan Kara + +The conversion is generally straightforward. We convert filesystem from +a global cache to per-fs one. Similarly to ext4 the tricky part is that +xattr block corresponding to found mbcache entry can get freed before we +get buffer lock for that block. So we have to check whether the entry is +still valid after getting the buffer lock. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext2/ext2.h | 3 ++ + fs/ext2/super.c | 25 ++++++---- + fs/ext2/xattr.c | 146 +++++++++++++++++++++++++++----------------------------- + fs/ext2/xattr.h | 21 ++------ + 4 files changed, 95 insertions(+), 100 deletions(-) + +diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h +index 4c69c94cafd8..f98ce7e60a0f 100644 +--- a/fs/ext2/ext2.h ++++ b/fs/ext2/ext2.h +@@ -61,6 +61,8 @@ struct ext2_block_alloc_info { + #define rsv_start rsv_window._rsv_start + #define rsv_end rsv_window._rsv_end + ++struct mb2_cache; ++ + /* + * second extended-fs super-block data in memory + */ +@@ -111,6 +113,7 @@ struct ext2_sb_info { + * of the mount options. 
+ */ + spinlock_t s_lock; ++ struct mb2_cache *s_mb_cache; + }; + + static inline spinlock_t * +diff --git a/fs/ext2/super.c b/fs/ext2/super.c +index 748d35afc902..111a31761ffa 100644 +--- a/fs/ext2/super.c ++++ b/fs/ext2/super.c +@@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb) + + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + +- ext2_xattr_put_super(sb); ++ if (sbi->s_mb_cache) { ++ ext2_xattr_destroy_cache(sbi->s_mb_cache); ++ sbi->s_mb_cache = NULL; ++ } + if (!(sb->s_flags & MS_RDONLY)) { + struct ext2_super_block *es = sbi->s_es; + +@@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) + ext2_msg(sb, KERN_ERR, "error: insufficient memory"); + goto failed_mount3; + } ++ ++#ifdef CONFIG_EXT2_FS_XATTR ++ sbi->s_mb_cache = ext2_xattr_create_cache(); ++ if (!sbi->s_mb_cache) { ++ ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache"); ++ goto failed_mount3; ++ } ++#endif + /* + * set up enough so that it can read an inode + */ +@@ -1149,6 +1160,8 @@ cantfind_ext2: + sb->s_id); + goto failed_mount; + failed_mount3: ++ if (sbi->s_mb_cache) ++ ext2_xattr_destroy_cache(sbi->s_mb_cache); + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); +@@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2"); + + static int __init init_ext2_fs(void) + { +- int err = init_ext2_xattr(); +- if (err) +- return err; ++ int err; ++ + err = init_inodecache(); + if (err) +- goto out1; ++ return err; + err = register_filesystem(&ext2_fs_type); + if (err) + goto out; + return 0; + out: + destroy_inodecache(); +-out1: +- exit_ext2_xattr(); + return err; + } + +@@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void) + { + unregister_filesystem(&ext2_fs_type); + destroy_inodecache(); +- exit_ext2_xattr(); + } + + MODULE_AUTHOR("Remy Card and others"); +diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c +index fa70848afa8f..c7ab4cadcea0 100644 +--- a/fs/ext2/xattr.c ++++ b/fs/ext2/xattr.c +@@ -56,7 +56,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include +@@ -92,14 +92,12 @@ + static int ext2_xattr_set2(struct inode *, struct buffer_head *, + struct ext2_xattr_header *); + +-static int ext2_xattr_cache_insert(struct buffer_head *); ++static int ext2_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); + static struct buffer_head *ext2_xattr_cache_find(struct inode *, + struct ext2_xattr_header *); + static void ext2_xattr_rehash(struct ext2_xattr_header *, + struct ext2_xattr_entry *); + +-static struct mb_cache *ext2_xattr_cache; +- + static const struct xattr_handler *ext2_xattr_handler_map[] = { + [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, + #ifdef CONFIG_EXT2_FS_POSIX_ACL +@@ -154,6 +152,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, + size_t name_len, size; + char *end; + int error; ++ struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); +@@ -198,7 +197,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", + goto found; + entry = next; + } +- if (ext2_xattr_cache_insert(bh)) ++ if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + ea_idebug(inode, "cache insert failed"); + error = -ENODATA; + goto cleanup; +@@ -211,7 +210,7 @@ found: + le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) + goto bad_block; + 
+- if (ext2_xattr_cache_insert(bh)) ++ if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + ea_idebug(inode, "cache insert failed"); + if (buffer) { + error = -ERANGE; +@@ -249,6 +248,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + char *end; + size_t rest = buffer_size; + int error; ++ struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); +@@ -283,7 +283,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", + goto bad_block; + entry = next; + } +- if (ext2_xattr_cache_insert(bh)) ++ if (ext2_xattr_cache_insert(ext2_mb_cache, bh)) + ea_idebug(inode, "cache insert failed"); + + /* list the attribute names */ +@@ -480,22 +480,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set", + /* Here we know that we can set the new attribute. */ + + if (header) { +- struct mb_cache_entry *ce; +- + /* assert(header == HDR(bh)); */ +- ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, +- bh->b_blocknr); + lock_buffer(bh); + if (header->h_refcount == cpu_to_le32(1)) { ++ __u32 hash = le32_to_cpu(header->h_hash); ++ + ea_bdebug(bh, "modifying in-place"); +- if (ce) +- mb_cache_entry_free(ce); ++ /* ++ * This must happen under buffer lock for ++ * ext2_xattr_set2() to reliably detect modified block ++ */ ++ mb2_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache, ++ hash, bh->b_blocknr); ++ + /* keep the buffer locked while modifying it. */ + } else { + int offset; + +- if (ce) +- mb_cache_entry_release(ce); + unlock_buffer(bh); + ea_bdebug(bh, "cloning"); + header = kmalloc(bh->b_size, GFP_KERNEL); +@@ -623,6 +624,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + int error; ++ struct mb2_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache; + + if (header) { + new_bh = ext2_xattr_cache_find(inode, header); +@@ -650,7 +652,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + don't need to change the reference count. */ + new_bh = old_bh; + get_bh(new_bh); +- ext2_xattr_cache_insert(new_bh); ++ ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + } else { + /* We need to allocate a new block */ + ext2_fsblk_t goal = ext2_group_first_block_no(sb, +@@ -671,7 +673,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + memcpy(new_bh->b_data, header, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); +- ext2_xattr_cache_insert(new_bh); ++ ext2_xattr_cache_insert(ext2_mb_cache, new_bh); + + ext2_xattr_update_super_block(sb); + } +@@ -704,19 +706,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + + error = 0; + if (old_bh && old_bh != new_bh) { +- struct mb_cache_entry *ce; +- + /* + * If there was an old block and we are no longer using it, + * release the old block. + */ +- ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev, +- old_bh->b_blocknr); + lock_buffer(old_bh); + if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { ++ __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash); ++ ++ /* ++ * This must happen under buffer lock for ++ * ext2_xattr_set2() to reliably detect freed block ++ */ ++ mb2_cache_entry_delete_block(ext2_mb_cache, ++ hash, old_bh->b_blocknr); + /* Free the old block. 
*/ +- if (ce) +- mb_cache_entry_free(ce); + ea_bdebug(old_bh, "freeing"); + ext2_free_blocks(inode, old_bh->b_blocknr, 1); + mark_inode_dirty(inode); +@@ -727,8 +731,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, + } else { + /* Decrement the refcount only. */ + le32_add_cpu(&HDR(old_bh)->h_refcount, -1); +- if (ce) +- mb_cache_entry_release(ce); + dquot_free_block_nodirty(inode, 1); + mark_inode_dirty(inode); + mark_buffer_dirty(old_bh); +@@ -754,7 +756,6 @@ void + ext2_xattr_delete_inode(struct inode *inode) + { + struct buffer_head *bh = NULL; +- struct mb_cache_entry *ce; + + down_write(&EXT2_I(inode)->xattr_sem); + if (!EXT2_I(inode)->i_file_acl) +@@ -774,19 +775,22 @@ ext2_xattr_delete_inode(struct inode *inode) + EXT2_I(inode)->i_file_acl); + goto cleanup; + } +- ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr); + lock_buffer(bh); + if (HDR(bh)->h_refcount == cpu_to_le32(1)) { +- if (ce) +- mb_cache_entry_free(ce); ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ ++ /* ++ * This must happen under buffer lock for ext2_xattr_set2() to ++ * reliably detect freed block ++ */ ++ mb2_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache, ++ hash, bh->b_blocknr); + ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); + get_bh(bh); + bforget(bh); + unlock_buffer(bh); + } else { + le32_add_cpu(&HDR(bh)->h_refcount, -1); +- if (ce) +- mb_cache_entry_release(ce); + ea_bdebug(bh, "refcount now=%d", + le32_to_cpu(HDR(bh)->h_refcount)); + unlock_buffer(bh); +@@ -803,18 +807,6 @@ cleanup: + } + + /* +- * ext2_xattr_put_super() +- * +- * This is called when a file system is unmounted. +- */ +-void +-ext2_xattr_put_super(struct super_block *sb) +-{ +- mb_cache_shrink(sb->s_bdev); +-} +- +- +-/* + * ext2_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert +@@ -823,27 +815,22 @@ ext2_xattr_put_super(struct super_block *sb) + * Returns 0, or a negative error number on failure. 
+ */ + static int +-ext2_xattr_cache_insert(struct buffer_head *bh) ++ext2_xattr_cache_insert(struct mb2_cache *cache, struct buffer_head *bh) + { + __u32 hash = le32_to_cpu(HDR(bh)->h_hash); +- struct mb_cache_entry *ce; ++ struct mb2_cache_entry *ce; + int error; + +- ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS); +- if (!ce) +- return -ENOMEM; +- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); +- if (error) { +- mb_cache_entry_free(ce); +- if (error == -EBUSY) { ++ ce = mb2_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr); ++ if (IS_ERR(ce)) { ++ if (PTR_ERR(ce) == -EBUSY) { + ea_bdebug(bh, "already in cache (%d cache entries)", + atomic_read(&ext2_xattr_cache->c_entry_count)); + error = 0; + } + } else { +- ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, +- atomic_read(&ext2_xattr_cache->c_entry_count)); +- mb_cache_entry_release(ce); ++ ea_bdebug(bh, "inserting [%x]", (int)hash); ++ mb2_cache_entry_put(cache, ce); + } + return error; + } +@@ -900,23 +887,17 @@ static struct buffer_head * + ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) + { + __u32 hash = le32_to_cpu(header->h_hash); +- struct mb_cache_entry *ce; ++ struct mb2_cache_entry *ce; ++ struct mb2_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); + again: +- ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev, +- hash); ++ ce = mb2_cache_entry_find_first(ext2_mb_cache, hash); + while (ce) { + struct buffer_head *bh; + +- if (IS_ERR(ce)) { +- if (PTR_ERR(ce) == -EAGAIN) +- goto again; +- break; +- } +- + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + ext2_error(inode->i_sb, "ext2_xattr_cache_find", +@@ -924,7 +905,21 @@ again: + inode->i_ino, (unsigned long) ce->e_block); + } else { + lock_buffer(bh); +- if (le32_to_cpu(HDR(bh)->h_refcount) > ++ /* ++ * We have to be careful about races with freeing or ++ * rehashing of xattr block. Once we hold buffer lock ++ * xattr block's state is stable so we can check ++ * whether the block got freed / rehashed or not. ++ * Since we unhash mbcache entry under buffer lock when ++ * freeing / rehashing xattr block, checking whether ++ * entry is still hashed is reliable. 
++ */ ++ if (hlist_bl_unhashed(&ce->e_hash_list)) { ++ mb2_cache_entry_put(ext2_mb_cache, ce); ++ unlock_buffer(bh); ++ brelse(bh); ++ goto again; ++ } else if (le32_to_cpu(HDR(bh)->h_refcount) > + EXT2_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %ld refcount %d>%d", + (unsigned long) ce->e_block, +@@ -933,13 +928,14 @@ again: + } else if (!ext2_xattr_cmp(header, HDR(bh))) { + ea_bdebug(bh, "b_count=%d", + atomic_read(&(bh->b_count))); +- mb_cache_entry_release(ce); ++ mb2_cache_entry_touch(ext2_mb_cache, ce); ++ mb2_cache_entry_put(ext2_mb_cache, ce); + return bh; + } + unlock_buffer(bh); + brelse(bh); + } +- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); ++ ce = mb2_cache_entry_find_next(ext2_mb_cache, ce); + } + return NULL; + } +@@ -1012,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header, + + #undef BLOCK_HASH_SHIFT + +-int __init +-init_ext2_xattr(void) ++#define HASH_BUCKET_BITS 10 ++ ++struct mb2_cache *ext2_xattr_create_cache(void) + { +- ext2_xattr_cache = mb_cache_create("ext2_xattr", 6); +- if (!ext2_xattr_cache) +- return -ENOMEM; +- return 0; ++ return mb2_cache_create(HASH_BUCKET_BITS); + } + +-void +-exit_ext2_xattr(void) ++void ext2_xattr_destroy_cache(struct mb2_cache *cache) + { +- mb_cache_destroy(ext2_xattr_cache); ++ if (cache) ++ mb2_cache_destroy(cache); + } +diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h +index 60edf298644e..6ea38aa9563a 100644 +--- a/fs/ext2/xattr.h ++++ b/fs/ext2/xattr.h +@@ -53,6 +53,8 @@ struct ext2_xattr_entry { + #define EXT2_XATTR_SIZE(size) \ + (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) + ++struct mb2_cache; ++ + # ifdef CONFIG_EXT2_FS_XATTR + + extern const struct xattr_handler ext2_xattr_user_handler; +@@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); + extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); + + extern void ext2_xattr_delete_inode(struct inode *); +-extern void ext2_xattr_put_super(struct super_block *); + +-extern int init_ext2_xattr(void); +-extern void exit_ext2_xattr(void); ++extern struct mb2_cache *ext2_xattr_create_cache(void); ++extern void ext2_xattr_destroy_cache(struct mb2_cache *cache); + + extern const struct xattr_handler *ext2_xattr_handlers[]; + +@@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode) + { + } + +-static inline void +-ext2_xattr_put_super(struct super_block *sb) +-{ +-} +- +-static inline int +-init_ext2_xattr(void) +-{ +- return 0; +-} +- +-static inline void +-exit_ext2_xattr(void) ++static inline void ext2_xattr_destroy_cache(struct mb2_cache *cache) + { + } + +-- +2.1.4 + + diff --git a/ext4-convert-to-mbcache2 b/ext4-convert-to-mbcache2 new file mode 100644 index 00000000..88b8385f --- /dev/null +++ b/ext4-convert-to-mbcache2 @@ -0,0 +1,389 @@ +ext4: convert to mbcache2 + +From: Jan Kara + +The conversion is generally straightforward. The only tricky part is +that xattr block corresponding to found mbcache entry can get freed +before we get buffer lock for that block. So we have to check whether +the entry is still valid after getting buffer lock. 
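+
+For illustration only, the revalidation boils down to the pattern sketched
+below. This is a reader's sketch rather than code from this patch: the helper
+name is made up, and the refcount and content checks the real code performs
+before reusing a block are omitted. In the patch itself the check is done in
+ext4_xattr_block_set() once it holds the buffer lock on the block it is about
+to reuse.
+
+  static struct buffer_head *
+  find_reusable_xattr_block(struct super_block *sb, struct mb2_cache *cache,
+                            __u32 hash)
+  {
+          struct mb2_cache_entry *ce;
+          struct buffer_head *bh;
+
+  again:
+          ce = mb2_cache_entry_find_first(cache, hash);
+          while (ce) {
+                  bh = sb_bread(sb, ce->e_block);
+                  if (!bh) {
+                          ce = mb2_cache_entry_find_next(cache, ce);
+                          continue;
+                  }
+                  lock_buffer(bh);
+                  if (hlist_bl_unhashed(&ce->e_hash_list)) {
+                          /* Freed or rehashed while we slept; start over. */
+                          unlock_buffer(bh);
+                          brelse(bh);
+                          mb2_cache_entry_put(cache, ce);
+                          goto again;
+                  }
+                  /* Entry still hashed: the block is stable under the lock. */
+                  mb2_cache_entry_touch(cache, ce);
+                  mb2_cache_entry_put(cache, ce);
+                  return bh;              /* returned with buffer lock held */
+          }
+          return NULL;
+  }
+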
+ +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/ext4.h | 2 +- + fs/ext4/super.c | 7 ++- + fs/ext4/xattr.c | 135 +++++++++++++++++++++++++++++--------------------------- + fs/ext4/xattr.h | 5 +-- + 4 files changed, 77 insertions(+), 72 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 750063f7a50c..068a3eaa41ac 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1371,7 +1371,7 @@ struct ext4_sb_info { + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; + struct ext4_es_stats s_es_stats; +- struct mb_cache *s_mb_cache; ++ struct mb2_cache *s_mb_cache; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. */ +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index c9ab67da6e5a..dbd5b9b9c99a 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -814,7 +814,6 @@ static void ext4_put_super(struct super_block *sb) + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); +- ext4_xattr_put_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + ext4_clear_feature_journal_needs_recovery(sb); +@@ -3759,7 +3758,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) + + no_journal: + if (ext4_mballoc_ready) { +- sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); ++ sbi->s_mb_cache = ext4_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; +@@ -3989,6 +3988,10 @@ failed_mount4: + if (EXT4_SB(sb)->rsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); + failed_mount_wq: ++ if (sbi->s_mb_cache) { ++ ext4_xattr_destroy_cache(sbi->s_mb_cache); ++ sbi->s_mb_cache = NULL; ++ } + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 6b6b3e751f8c..a80e5e2acadd 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -53,7 +53,7 @@ + #include + #include + #include +-#include ++#include + #include + #include "ext4_jbd2.h" + #include "ext4.h" +@@ -80,10 +80,10 @@ + # define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) + #endif + +-static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); ++static void ext4_xattr_cache_insert(struct mb2_cache *, struct buffer_head *); + static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, +- struct mb_cache_entry **); ++ struct mb2_cache_entry **); + static void ext4_xattr_rehash(struct ext4_xattr_header *, + struct ext4_xattr_entry *); + static int ext4_xattr_list(struct dentry *dentry, char *buffer, +@@ -278,7 +278,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + struct ext4_xattr_entry *entry; + size_t size; + int error; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ++ struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); +@@ -425,7 +425,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) + struct inode *inode = d_inode(dentry); + struct buffer_head *bh = NULL; + int error; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ++ struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + ea_idebug(inode, "buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); +@@ -542,11 +542,8 @@ static void + ext4_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) + { +- struct mb_cache_entry *ce = NULL; + int error = 0; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + +- ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); + BUFFER_TRACE(bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, bh); + if (error) +@@ -554,9 +551,15 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, + + lock_buffer(bh); + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { ++ __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); ++ + ea_bdebug(bh, "refcount now=0; freeing"); +- if (ce) +- mb_cache_entry_free(ce); ++ /* ++ * This must happen under buffer lock for ++ * ext4_xattr_block_set() to reliably detect freed block ++ */ ++ mb2_cache_entry_delete_block(EXT4_GET_MB_CACHE(inode), hash, ++ bh->b_blocknr); + get_bh(bh); + unlock_buffer(bh); + ext4_free_blocks(handle, inode, bh, 0, 1, +@@ -564,8 +567,6 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, + EXT4_FREE_BLOCKS_FORGET); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); +- if (ce) +- mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect +@@ -778,17 +779,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext4_xattr_search *s = &bs->s; +- struct mb_cache_entry *ce = NULL; ++ struct mb2_cache_entry *ce = NULL; + int error = 0; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ++ struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + #define header(x) ((struct ext4_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { +- ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, +- bs->bh->b_blocknr); + BUFFER_TRACE(bs->bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, bs->bh); + if (error) +@@ -796,10 +795,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == cpu_to_le32(1)) { +- 
if (ce) { +- mb_cache_entry_free(ce); +- ce = NULL; +- } ++ __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); ++ ++ /* ++ * This must happen under buffer lock for ++ * ext4_xattr_block_set() to reliably detect modified ++ * block ++ */ ++ mb2_cache_entry_delete_block(ext4_mb_cache, hash, ++ bs->bh->b_blocknr); + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s); + if (!error) { +@@ -823,10 +827,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); +- if (ce) { +- mb_cache_entry_release(ce); +- ce = NULL; +- } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; +@@ -881,6 +881,31 @@ inserted: + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); ++ /* ++ * We have to be careful about races with ++ * freeing or rehashing of xattr block. Once we ++ * hold buffer lock xattr block's state is ++ * stable so we can check whether the block got ++ * freed / rehashed or not. Since we unhash ++ * mbcache entry under buffer lock when freeing ++ * / rehashing xattr block, checking whether ++ * entry is still hashed is reliable. ++ */ ++ if (hlist_bl_unhashed(&ce->e_hash_list)) { ++ /* ++ * Undo everything and check mbcache ++ * again. ++ */ ++ unlock_buffer(new_bh); ++ dquot_free_block(inode, ++ EXT4_C2B(EXT4_SB(sb), ++ 1)); ++ brelse(new_bh); ++ mb2_cache_entry_put(ext4_mb_cache, ce); ++ ce = NULL; ++ new_bh = NULL; ++ goto inserted; ++ } + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); +@@ -891,7 +916,8 @@ inserted: + if (error) + goto cleanup_dquot; + } +- mb_cache_entry_release(ce); ++ mb2_cache_entry_touch(ext4_mb_cache, ce); ++ mb2_cache_entry_put(ext4_mb_cache, ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ +@@ -956,7 +982,7 @@ getblk_failed: + + cleanup: + if (ce) +- mb_cache_entry_release(ce); ++ mb2_cache_entry_put(ext4_mb_cache, ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); +@@ -1509,17 +1535,6 @@ cleanup: + } + + /* +- * ext4_xattr_put_super() +- * +- * This is called when a file system is unmounted. +- */ +-void +-ext4_xattr_put_super(struct super_block *sb) +-{ +- mb_cache_shrink(sb->s_bdev); +-} +- +-/* + * ext4_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert +@@ -1528,27 +1543,22 @@ ext4_xattr_put_super(struct super_block *sb) + * Returns 0, or a negative error number on failure. 
+ */ + static void +-ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) ++ext4_xattr_cache_insert(struct mb2_cache *ext4_mb_cache, struct buffer_head *bh) + { + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); +- struct mb_cache_entry *ce; ++ struct mb2_cache_entry *ce; + int error; + +- ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); +- if (!ce) { +- ea_bdebug(bh, "out of memory"); +- return; +- } +- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); +- if (error) { +- mb_cache_entry_free(ce); +- if (error == -EBUSY) { ++ ce = mb2_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, ++ bh->b_blocknr); ++ if (IS_ERR(ce)) { ++ if (PTR_ERR(ce) == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); +- mb_cache_entry_release(ce); ++ mb2_cache_entry_put(ext4_mb_cache, ce); + } + } + +@@ -1602,26 +1612,19 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, + */ + static struct buffer_head * + ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, +- struct mb_cache_entry **pce) ++ struct mb2_cache_entry **pce) + { + __u32 hash = le32_to_cpu(header->h_hash); +- struct mb_cache_entry *ce; +- struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ++ struct mb2_cache_entry *ce; ++ struct mb2_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +-again: +- ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, +- hash); ++ ce = mb2_cache_entry_find_first(ext4_mb_cache, hash); + while (ce) { + struct buffer_head *bh; + +- if (IS_ERR(ce)) { +- if (PTR_ERR(ce) == -EAGAIN) +- goto again; +- break; +- } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %lu read error", +@@ -1637,7 +1640,7 @@ again: + return bh; + } + brelse(bh); +- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); ++ ce = mb2_cache_entry_find_next(ext4_mb_cache, ce); + } + return NULL; + } +@@ -1712,15 +1715,15 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, + + #define HASH_BUCKET_BITS 10 + +-struct mb_cache * +-ext4_xattr_create_cache(char *name) ++struct mb2_cache * ++ext4_xattr_create_cache(void) + { +- return mb_cache_create(name, HASH_BUCKET_BITS); ++ return mb2_cache_create(HASH_BUCKET_BITS); + } + +-void ext4_xattr_destroy_cache(struct mb_cache *cache) ++void ext4_xattr_destroy_cache(struct mb2_cache *cache) + { + if (cache) +- mb_cache_destroy(cache); ++ mb2_cache_destroy(cache); + } + +diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h +index ddc0957760ba..10b0f7323ed6 100644 +--- a/fs/ext4/xattr.h ++++ b/fs/ext4/xattr.h +@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_ + extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + + extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +-extern void ext4_xattr_put_super(struct super_block *); + + extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); +@@ -124,8 +123,8 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); + +-extern struct mb_cache *ext4_xattr_create_cache(char *name); +-extern void ext4_xattr_destroy_cache(struct mb_cache *); ++extern struct mb2_cache 
*ext4_xattr_create_cache(void); ++extern void ext4_xattr_destroy_cache(struct mb2_cache *); + + #ifdef CONFIG_EXT4_FS_SECURITY + extern int ext4_init_security(handle_t *handle, struct inode *inode, +-- +2.1.4 + + diff --git a/mbcache2-limit-cache-size b/mbcache2-limit-cache-size new file mode 100644 index 00000000..83c6f568 --- /dev/null +++ b/mbcache2-limit-cache-size @@ -0,0 +1,139 @@ +mbcache2: limit cache size + +From: Jan Kara + +So far number of entries in mbcache is limited only by the pressure from +the shrinker. Since too many entries degrade the hash table and +generally we expect that caching more entries has diminishing returns, +limit number of entries the same way as in the old mbcache to 16 * hash +table size. + +Once we exceed the desired maximum number of entries, we schedule a +backround work to reclaim entries. If the background work cannot keep up +and the number of entries exceeds two times the desired maximum, we +reclaim some entries directly when allocating a new entry. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/mbcache2.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 45 insertions(+), 5 deletions(-) + +diff --git a/fs/mbcache2.c b/fs/mbcache2.c +index 4ccf0752c6d1..fe9f6f6a2953 100644 +--- a/fs/mbcache2.c ++++ b/fs/mbcache2.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -21,16 +22,29 @@ struct mb2_cache { + struct hlist_bl_head *c_hash; + /* log2 of hash table size */ + int c_bucket_bits; ++ /* Maximum entries in cache to avoid degrading hash too much */ ++ int c_max_entries; + /* Protects c_lru_list, c_entry_count */ + spinlock_t c_lru_list_lock; + struct list_head c_lru_list; + /* Number of entries in cache */ + unsigned long c_entry_count; + struct shrinker c_shrink; ++ /* Work for shrinking when the cache has too many entries */ ++ struct work_struct c_shrink_work; + }; + + static struct kmem_cache *mb2_entry_cache; + ++static unsigned long mb2_cache_shrink(struct mb2_cache *cache, ++ unsigned int nr_to_scan); ++ ++/* ++ * Number of entries to reclaim synchronously when there are too many entries ++ * in cache ++ */ ++#define SYNC_SHRINK_BATCH 64 ++ + /* + * mb2_cache_entry_create - create entry in cache + * @cache - cache where the entry should be created +@@ -52,6 +66,13 @@ struct mb2_cache_entry *mb2_cache_entry_create(struct mb2_cache *cache, + struct hlist_bl_node *dup_node; + struct hlist_bl_head *head; + ++ /* Schedule background reclaim if there are too many entries */ ++ if (cache->c_entry_count >= cache->c_max_entries) ++ schedule_work(&cache->c_shrink_work); ++ /* Do some sync reclaim if background reclaim cannot keep up */ ++ if (cache->c_entry_count >= 2*cache->c_max_entries) ++ mb2_cache_shrink(cache, SYNC_SHRINK_BATCH); ++ + entry = kmem_cache_alloc(mb2_entry_cache, mask); + if (!entry) + return ERR_PTR(-ENOMEM); +@@ -252,12 +273,9 @@ static unsigned long mb2_cache_count(struct shrinker *shrink, + } + + /* Shrink number of entries in cache */ +-static unsigned long mb2_cache_scan(struct shrinker *shrink, +- struct shrink_control *sc) ++static unsigned long mb2_cache_shrink(struct mb2_cache *cache, ++ unsigned int nr_to_scan) + { +- int nr_to_scan = sc->nr_to_scan; +- struct mb2_cache *cache = container_of(shrink, struct mb2_cache, +- c_shrink); + struct mb2_cache_entry *entry; + struct hlist_bl_head *head; + unsigned int shrunk = 0; +@@ -290,6 +308,25 @@ static unsigned long mb2_cache_scan(struct shrinker *shrink, + return shrunk; + } + ++static unsigned 
long mb2_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ int nr_to_scan = sc->nr_to_scan; ++ struct mb2_cache *cache = container_of(shrink, struct mb2_cache, ++ c_shrink); ++ return mb2_cache_shrink(cache, nr_to_scan); ++} ++ ++/* We shrink 1/X of the cache when we have too many entries in it */ ++#define SHRINK_DIVISOR 16 ++ ++static void mb2_cache_shrink_worker(struct work_struct *work) ++{ ++ struct mb2_cache *cache = container_of(work, struct mb2_cache, ++ c_shrink_work); ++ mb2_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); ++} ++ + /* + * mb2_cache_create - create cache + * @bucket_bits: log2 of the hash table size +@@ -309,6 +346,7 @@ struct mb2_cache *mb2_cache_create(int bucket_bits) + if (!cache) + goto err_out; + cache->c_bucket_bits = bucket_bits; ++ cache->c_max_entries = bucket_count << 4; + INIT_LIST_HEAD(&cache->c_lru_list); + spin_lock_init(&cache->c_lru_list_lock); + cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), +@@ -325,6 +363,8 @@ struct mb2_cache *mb2_cache_create(int bucket_bits) + cache->c_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&cache->c_shrink); + ++ INIT_WORK(&cache->c_shrink_work, mb2_cache_shrink_worker); ++ + return cache; + + err_out: +-- +2.1.4 + + diff --git a/mbcache2-use-referenced-bit-instead-of-LRU b/mbcache2-use-referenced-bit-instead-of-LRU new file mode 100644 index 00000000..1711ffdc --- /dev/null +++ b/mbcache2-use-referenced-bit-instead-of-LRU @@ -0,0 +1,139 @@ +mbcache2: use referenced bit instead of LRU + +From: Jan Kara + +Currently we maintain perfect LRU list by moving entry to the tail of +the list when it gets used. However these operations on cache-global +list are relatively expensive. + +In this patch we switch to lazy updates of LRU list. Whenever entry gets +used, we set a referenced bit in it. When reclaiming entries, we give +referenced entries another round in the LRU. + +In my testing this logic gives about 30% boost to workloads with mostly +unique xattr blocks (e.g. xattr-bench with 10 files and 10000 unique +xattr values). 
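+
+To make the scheme concrete, a minimal sketch of the second-chance logic is
+shown below. The helper names are illustrative and are not the ones added by
+this patch; as in the patch, the referenced bit lives in bit 0 of the entry's
+hash-head pointer so the entry does not grow, and the eviction decision is
+made by the shrinker under c_lru_list_lock inside fs/mbcache2.c.
+
+  /* Hot path: mark the entry used without touching the global LRU list. */
+  static inline void mb2_sketch_touch(struct mb2_cache_entry *entry)
+  {
+          entry->_e_hash_list_head |= 1;
+  }
+
+  /* Shrinker path, called with cache->c_lru_list_lock held. */
+  static bool mb2_sketch_should_evict(struct mb2_cache *cache,
+                                      struct mb2_cache_entry *entry)
+  {
+          if (!(entry->_e_hash_list_head & 1))
+                  return true;            /* cold: reclaim it now */
+          /* Referenced: clear the bit, requeue, spare it this round. */
+          entry->_e_hash_list_head &= ~1;
+          list_move_tail(&entry->e_lru_list, &cache->c_lru_list);
+          return false;
+  }
+
+The hot path thus touches only the entry itself; the cache-wide LRU lock is
+taken only when the shrinker actually scans the list.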
+ +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/mbcache2.c | 41 +++++++++++++++++++++++++++++++++-------- + include/linux/mbcache2.h | 7 +++++-- + 2 files changed, 38 insertions(+), 10 deletions(-) + +diff --git a/fs/mbcache2.c b/fs/mbcache2.c +index fe9f6f6a2953..60310a690f8d 100644 +--- a/fs/mbcache2.c ++++ b/fs/mbcache2.c +@@ -39,6 +39,29 @@ static struct kmem_cache *mb2_entry_cache; + static unsigned long mb2_cache_shrink(struct mb2_cache *cache, + unsigned int nr_to_scan); + ++static inline bool mb2_cache_entry_referenced(struct mb2_cache_entry *entry) ++{ ++ return entry->_e_hash_list_head & 1; ++} ++ ++static inline void mb2_cache_entry_set_referenced(struct mb2_cache_entry *entry) ++{ ++ entry->_e_hash_list_head |= 1; ++} ++ ++static inline void mb2_cache_entry_clear_referenced( ++ struct mb2_cache_entry *entry) ++{ ++ entry->_e_hash_list_head &= ~1; ++} ++ ++static inline struct hlist_bl_head *mb2_cache_entry_head( ++ struct mb2_cache_entry *entry) ++{ ++ return (struct hlist_bl_head *) ++ (entry->_e_hash_list_head & ~1); ++} ++ + /* + * Number of entries to reclaim synchronously when there are too many entries + * in cache +@@ -83,7 +106,7 @@ struct mb2_cache_entry *mb2_cache_entry_create(struct mb2_cache *cache, + entry->e_key = key; + entry->e_block = block; + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; +- entry->e_hash_list_head = head; ++ entry->_e_hash_list_head = (unsigned long)head; + hlist_bl_lock(head); + hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { + if (dup->e_key == key && dup->e_block == block) { +@@ -125,7 +148,7 @@ EXPORT_SYMBOL(__mb2_cache_entry_free); + void mb2_cache_entry_delete(struct mb2_cache *cache, + struct mb2_cache_entry *entry) + { +- struct hlist_bl_head *head = entry->e_hash_list_head; ++ struct hlist_bl_head *head = mb2_cache_entry_head(entry); + + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { +@@ -153,7 +176,7 @@ static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, + struct hlist_bl_head *head; + + if (entry) +- head = entry->e_hash_list_head; ++ head = mb2_cache_entry_head(entry); + else + head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; + hlist_bl_lock(head); +@@ -256,10 +279,7 @@ EXPORT_SYMBOL(mb2_cache_entry_delete_block); + void mb2_cache_entry_touch(struct mb2_cache *cache, + struct mb2_cache_entry *entry) + { +- spin_lock(&cache->c_lru_list_lock); +- if (!list_empty(&entry->e_lru_list)) +- list_move_tail(&cache->c_lru_list, &entry->e_lru_list); +- spin_unlock(&cache->c_lru_list_lock); ++ mb2_cache_entry_set_referenced(entry); + } + EXPORT_SYMBOL(mb2_cache_entry_touch); + +@@ -284,6 +304,11 @@ static unsigned long mb2_cache_shrink(struct mb2_cache *cache, + while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) { + entry = list_first_entry(&cache->c_lru_list, + struct mb2_cache_entry, e_lru_list); ++ if (mb2_cache_entry_referenced(entry)) { ++ mb2_cache_entry_clear_referenced(entry); ++ list_move_tail(&cache->c_lru_list, &entry->e_lru_list); ++ continue; ++ } + list_del_init(&entry->e_lru_list); + cache->c_entry_count--; + /* +@@ -291,7 +316,7 @@ static unsigned long mb2_cache_shrink(struct mb2_cache *cache, + * from under us. 
+ */ + spin_unlock(&cache->c_lru_list_lock); +- head = entry->e_hash_list_head; ++ head = mb2_cache_entry_head(entry); + hlist_bl_lock(head); + if (!hlist_bl_unhashed(&entry->e_hash_list)) { + hlist_bl_del_init(&entry->e_hash_list); +diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h +index 2a58c51c3a0a..ca5b509c14a8 100644 +--- a/include/linux/mbcache2.h ++++ b/include/linux/mbcache2.h +@@ -19,8 +19,11 @@ struct mb2_cache_entry { + unsigned int e_key; + /* Block number of hashed block - stable during lifetime of the entry */ + sector_t e_block; +- /* Head of hash list (for list bit lock) - stable */ +- struct hlist_bl_head *e_hash_list_head; ++ /* ++ * Head of hash list (for list bit lock) - stable. Combined with ++ * referenced bit of entry ++ */ ++ unsigned long _e_hash_list_head; + }; + + struct mb2_cache *mb2_cache_create(int bucket_bits); +-- +2.1.4 + + diff --git a/reimplement-mbcache b/reimplement-mbcache new file mode 100644 index 00000000..145ab695 --- /dev/null +++ b/reimplement-mbcache @@ -0,0 +1,541 @@ +mbcache2: reimplement mbcache + +From: Jan Kara + +Original mbcache was designed to have more features than what ext? +filesystems ended up using. It supported entry being in more hashes, it +had a home-grown rwlocking of each entry, and one cache could cache +entries from multiple filesystems. This genericity also resulted in more +complex locking, larger cache entries, and generally more code +complexity. + +This is reimplementation of the mbcache functionality to exactly fit the +purpose ext? filesystems use it for. Cache entries are now considerably +smaller (7 instead of 13 longs), the code is considerably smaller as +well (432 vs 913 lines of code), and IMO also simpler. The new code is +also much more lightweight. + +I have measured the speed using artificial xattr-bench benchmark, which +spawns P processes, each process sets xattr for F different files, and +the value of xattr is randomly chosen from a pool of V values. Averages +of runtimes for 5 runs for various combinations of parameters are below. +The first value in each cell is old mbache, the second value is the new +mbcache. 
+ +V=10 +F\P 1 2 4 8 16 32 64 +10 0.158,0.157 0.208,0.196 0.500,0.277 0.798,0.400 3.258,0.584 13.807,1.047 61.339,2.803 +100 0.172,0.167 0.279,0.222 0.520,0.275 0.825,0.341 2.981,0.505 12.022,1.202 44.641,2.943 +1000 0.185,0.174 0.297,0.239 0.445,0.283 0.767,0.340 2.329,0.480 6.342,1.198 16.440,3.888 + +V=100 +F\P 1 2 4 8 16 32 64 +10 0.162,0.153 0.200,0.186 0.362,0.257 0.671,0.496 1.433,0.943 3.801,1.345 7.938,2.501 +100 0.153,0.160 0.221,0.199 0.404,0.264 0.945,0.379 1.556,0.485 3.761,1.156 7.901,2.484 +1000 0.215,0.191 0.303,0.246 0.471,0.288 0.960,0.347 1.647,0.479 3.916,1.176 8.058,3.160 + +V=1000 +F\P 1 2 4 8 16 32 64 +10 0.151,0.129 0.210,0.163 0.326,0.245 0.685,0.521 1.284,0.859 3.087,2.251 6.451,4.801 +100 0.154,0.153 0.211,0.191 0.276,0.282 0.687,0.506 1.202,0.877 3.259,1.954 8.738,2.887 +1000 0.145,0.179 0.202,0.222 0.449,0.319 0.899,0.333 1.577,0.524 4.221,1.240 9.782,3.579 + +V=10000 +F\P 1 2 4 8 16 32 64 +10 0.161,0.154 0.198,0.190 0.296,0.256 0.662,0.480 1.192,0.818 2.989,2.200 6.362,4.746 +100 0.176,0.174 0.236,0.203 0.326,0.255 0.696,0.511 1.183,0.855 4.205,3.444 19.510,17.760 +1000 0.199,0.183 0.240,0.227 1.159,1.014 2.286,2.154 6.023,6.039 10.933,--- 36.620,--- + +V=100000 +F\P 1 2 4 8 16 32 64 +10 0.171,0.162 0.204,0.198 0.285,0.230 0.692,0.500 1.225,0.881 2.990,2.243 6.379,4.771 +100 0.151,0.171 0.220,0.210 0.295,0.255 0.720,0.518 1.226,0.844 3.423,2.831 19.234,17.544 +1000 0.192,0.189 0.249,0.225 1.162,1.043 2.257,2.093 5.853,4.997 10.399,--- 32.198,--- + +We see that the new code is faster in pretty much all the cases and +starting from 4 processes there are significant gains with the new code +resulting in upto 20-times shorter runtimes. Also for large numbers of +cached entries all values for the old code could not be measured as the +kernel started hitting softlockups and died before the test completed. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/Makefile | 2 +- + fs/mbcache2.c | 388 +++++++++++++++++++++++++++++++++++++++++++++++ + include/linux/mbcache2.h | 54 +++++++ + 3 files changed, 443 insertions(+), 1 deletion(-) + create mode 100644 fs/mbcache2.c + create mode 100644 include/linux/mbcache2.h + +diff --git a/fs/Makefile b/fs/Makefile +index 79f522575cba..15b3d6c4e46a 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o + obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o + obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o + +-obj-$(CONFIG_FS_MBCACHE) += mbcache.o ++obj-$(CONFIG_FS_MBCACHE) += mbcache.o mbcache2.o + obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o + obj-$(CONFIG_NFS_COMMON) += nfs_common/ + obj-$(CONFIG_COREDUMP) += coredump.o +diff --git a/fs/mbcache2.c b/fs/mbcache2.c +new file mode 100644 +index 000000000000..4ccf0752c6d1 +--- /dev/null ++++ b/fs/mbcache2.c +@@ -0,0 +1,388 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Mbcache is a simple key-value store. Keys need not be unique, however ++ * key-value pairs are expected to be unique (we use this in ++ * mb2_cache_entry_delete_block()). ++ * ++ * We provide functions for creation and removal of entries, search by key, ++ * and a special "delete entry with given key-value pair" operation. Fixed ++ * size hash table is used for fast key lookups. 
++ */ ++ ++struct mb2_cache { ++ /* Hash table of entries */ ++ struct hlist_bl_head *c_hash; ++ /* log2 of hash table size */ ++ int c_bucket_bits; ++ /* Protects c_lru_list, c_entry_count */ ++ spinlock_t c_lru_list_lock; ++ struct list_head c_lru_list; ++ /* Number of entries in cache */ ++ unsigned long c_entry_count; ++ struct shrinker c_shrink; ++}; ++ ++static struct kmem_cache *mb2_entry_cache; ++ ++/* ++ * mb2_cache_entry_create - create entry in cache ++ * @cache - cache where the entry should be created ++ * @mask - gfp mask with which the entry should be allocated ++ * @key - key of the entry ++ * @block - block that contains data ++ * ++ * Creates entry in @cache with key @key and records that data is stored in ++ * block @block. The function returns -EBUSY if entry with the same key ++ * and for the same block already exists in cache. Otherwise reference to ++ * the created entry is returned. ++ */ ++struct mb2_cache_entry *mb2_cache_entry_create(struct mb2_cache *cache, ++ gfp_t mask, ++ unsigned int key, ++ sector_t block) ++{ ++ struct mb2_cache_entry *entry, *dup; ++ struct hlist_bl_node *dup_node; ++ struct hlist_bl_head *head; ++ ++ entry = kmem_cache_alloc(mb2_entry_cache, mask); ++ if (!entry) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&entry->e_lru_list); ++ /* One ref for hash, one ref returned */ ++ atomic_set(&entry->e_refcnt, 2); ++ entry->e_key = key; ++ entry->e_block = block; ++ head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; ++ entry->e_hash_list_head = head; ++ hlist_bl_lock(head); ++ hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { ++ if (dup->e_key == key && dup->e_block == block) { ++ hlist_bl_unlock(head); ++ kmem_cache_free(mb2_entry_cache, entry); ++ return ERR_PTR(-EBUSY); ++ } ++ } ++ hlist_bl_add_head(&entry->e_hash_list, head); ++ hlist_bl_unlock(head); ++ ++ spin_lock(&cache->c_lru_list_lock); ++ list_add_tail(&entry->e_lru_list, &cache->c_lru_list); ++ /* Grab ref for LRU list */ ++ atomic_inc(&entry->e_refcnt); ++ cache->c_entry_count++; ++ spin_unlock(&cache->c_lru_list_lock); ++ ++ return entry; ++} ++EXPORT_SYMBOL(mb2_cache_entry_create); ++ ++void __mb2_cache_entry_free(struct mb2_cache_entry *entry) ++{ ++ kmem_cache_free(mb2_entry_cache, entry); ++} ++EXPORT_SYMBOL(__mb2_cache_entry_free); ++ ++/* ++ * mb2_cache_entry_delete - delete entry from cache ++ * @cache - cache where the entry is ++ * @entry - entry to delete ++ * ++ * Delete entry from cache. The entry is unhashed and deleted from the lru list ++ * so it cannot be found. We also drop the reference to @entry caller gave us. ++ * However entry need not be freed if there's someone else still holding a ++ * reference to it. Freeing happens when the last reference is dropped. 
++ */ ++void mb2_cache_entry_delete(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry) ++{ ++ struct hlist_bl_head *head = entry->e_hash_list_head; ++ ++ hlist_bl_lock(head); ++ if (!hlist_bl_unhashed(&entry->e_hash_list)) { ++ hlist_bl_del_init(&entry->e_hash_list); ++ atomic_dec(&entry->e_refcnt); ++ } ++ hlist_bl_unlock(head); ++ spin_lock(&cache->c_lru_list_lock); ++ if (!list_empty(&entry->e_lru_list)) { ++ list_del_init(&entry->e_lru_list); ++ cache->c_entry_count--; ++ atomic_dec(&entry->e_refcnt); ++ } ++ spin_unlock(&cache->c_lru_list_lock); ++ mb2_cache_entry_put(cache, entry); ++} ++EXPORT_SYMBOL(mb2_cache_entry_delete); ++ ++static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry, ++ unsigned int key) ++{ ++ struct mb2_cache_entry *old_entry = entry; ++ struct hlist_bl_node *node; ++ struct hlist_bl_head *head; ++ ++ if (entry) ++ head = entry->e_hash_list_head; ++ else ++ head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; ++ hlist_bl_lock(head); ++ if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) ++ node = entry->e_hash_list.next; ++ else ++ node = hlist_bl_first(head); ++ while (node) { ++ entry = hlist_bl_entry(node, struct mb2_cache_entry, ++ e_hash_list); ++ if (entry->e_key == key) { ++ atomic_inc(&entry->e_refcnt); ++ goto out; ++ } ++ node = node->next; ++ } ++ entry = NULL; ++out: ++ hlist_bl_unlock(head); ++ if (old_entry) ++ mb2_cache_entry_put(cache, old_entry); ++ ++ return entry; ++} ++ ++/* ++ * mb2_cache_entry_find_first - find the first entry in cache with given key ++ * @cache: cache where we should search ++ * @key: key to look for ++ * ++ * Search in @cache for entry with key @key. Grabs reference to the first ++ * entry found and returns the entry. ++ */ ++struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, ++ unsigned int key) ++{ ++ return __entry_find(cache, NULL, key); ++} ++EXPORT_SYMBOL(mb2_cache_entry_find_first); ++ ++/* ++ * mb2_cache_entry_find_next - find next entry in cache with the same ++ * @cache: cache where we should search ++ * @entry: entry to start search from ++ * ++ * Finds next entry in the hash chain which has the same key as @entry. ++ * If @entry is unhashed (which can happen when deletion of entry races ++ * with the search), finds the first entry in the hash chain. The function ++ * drops reference to @entry and returns with a reference to the found entry. ++ */ ++struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry) ++{ ++ return __entry_find(cache, entry, entry->e_key); ++} ++EXPORT_SYMBOL(mb2_cache_entry_find_next); ++ ++/* mb2_cache_entry_delete_block - remove information about block from cache ++ * @cache - cache we work with ++ * @key - key of the entry to remove ++ * @block - block containing data for @key ++ * ++ * Remove entry from cache @cache with key @key with data stored in @block. 
++ */ ++void mb2_cache_entry_delete_block(struct mb2_cache *cache, unsigned int key, ++ sector_t block) ++{ ++ struct hlist_bl_node *node; ++ struct hlist_bl_head *head; ++ struct mb2_cache_entry *entry; ++ ++ head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; ++ hlist_bl_lock(head); ++ hlist_bl_for_each_entry(entry, node, head, e_hash_list) { ++ if (entry->e_key == key && entry->e_block == block) { ++ /* We keep hash list reference to keep entry alive */ ++ hlist_bl_del_init(&entry->e_hash_list); ++ hlist_bl_unlock(head); ++ spin_lock(&cache->c_lru_list_lock); ++ if (!list_empty(&entry->e_lru_list)) { ++ list_del_init(&entry->e_lru_list); ++ cache->c_entry_count--; ++ atomic_dec(&entry->e_refcnt); ++ } ++ spin_unlock(&cache->c_lru_list_lock); ++ mb2_cache_entry_put(cache, entry); ++ return; ++ } ++ } ++ hlist_bl_unlock(head); ++} ++EXPORT_SYMBOL(mb2_cache_entry_delete_block); ++ ++/* mb2_cache_entry_touch - cache entry got used ++ * @cache - cache the entry belongs to ++ * @entry - entry that got used ++ * ++ * Move entry in lru list to reflect the fact that it was used. ++ */ ++void mb2_cache_entry_touch(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry) ++{ ++ spin_lock(&cache->c_lru_list_lock); ++ if (!list_empty(&entry->e_lru_list)) ++ list_move_tail(&cache->c_lru_list, &entry->e_lru_list); ++ spin_unlock(&cache->c_lru_list_lock); ++} ++EXPORT_SYMBOL(mb2_cache_entry_touch); ++ ++static unsigned long mb2_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct mb2_cache *cache = container_of(shrink, struct mb2_cache, ++ c_shrink); ++ ++ return cache->c_entry_count; ++} ++ ++/* Shrink number of entries in cache */ ++static unsigned long mb2_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ int nr_to_scan = sc->nr_to_scan; ++ struct mb2_cache *cache = container_of(shrink, struct mb2_cache, ++ c_shrink); ++ struct mb2_cache_entry *entry; ++ struct hlist_bl_head *head; ++ unsigned int shrunk = 0; ++ ++ spin_lock(&cache->c_lru_list_lock); ++ while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) { ++ entry = list_first_entry(&cache->c_lru_list, ++ struct mb2_cache_entry, e_lru_list); ++ list_del_init(&entry->e_lru_list); ++ cache->c_entry_count--; ++ /* ++ * We keep LRU list reference so that entry doesn't go away ++ * from under us. ++ */ ++ spin_unlock(&cache->c_lru_list_lock); ++ head = entry->e_hash_list_head; ++ hlist_bl_lock(head); ++ if (!hlist_bl_unhashed(&entry->e_hash_list)) { ++ hlist_bl_del_init(&entry->e_hash_list); ++ atomic_dec(&entry->e_refcnt); ++ } ++ hlist_bl_unlock(head); ++ if (mb2_cache_entry_put(cache, entry)) ++ shrunk++; ++ cond_resched(); ++ spin_lock(&cache->c_lru_list_lock); ++ } ++ spin_unlock(&cache->c_lru_list_lock); ++ ++ return shrunk; ++} ++ ++/* ++ * mb2_cache_create - create cache ++ * @bucket_bits: log2 of the hash table size ++ * ++ * Create cache for keys with 2^bucket_bits hash entries. 
++ */ ++struct mb2_cache *mb2_cache_create(int bucket_bits) ++{ ++ struct mb2_cache *cache; ++ int bucket_count = 1 << bucket_bits; ++ int i; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return NULL; ++ ++ cache = kzalloc(sizeof(struct mb2_cache), GFP_KERNEL); ++ if (!cache) ++ goto err_out; ++ cache->c_bucket_bits = bucket_bits; ++ INIT_LIST_HEAD(&cache->c_lru_list); ++ spin_lock_init(&cache->c_lru_list_lock); ++ cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), ++ GFP_KERNEL); ++ if (!cache->c_hash) { ++ kfree(cache); ++ goto err_out; ++ } ++ for (i = 0; i < bucket_count; i++) ++ INIT_HLIST_BL_HEAD(&cache->c_hash[i]); ++ ++ cache->c_shrink.count_objects = mb2_cache_count; ++ cache->c_shrink.scan_objects = mb2_cache_scan; ++ cache->c_shrink.seeks = DEFAULT_SEEKS; ++ register_shrinker(&cache->c_shrink); ++ ++ return cache; ++ ++err_out: ++ module_put(THIS_MODULE); ++ return NULL; ++} ++EXPORT_SYMBOL(mb2_cache_create); ++ ++/* ++ * mb2_cache_destroy - destroy cache ++ * @cache: the cache to destroy ++ * ++ * Free all entries in cache and cache itself. Caller must make sure nobody ++ * (except shrinker) can reach @cache when calling this. ++ */ ++void mb2_cache_destroy(struct mb2_cache *cache) ++{ ++ struct mb2_cache_entry *entry, *next; ++ ++ unregister_shrinker(&cache->c_shrink); ++ ++ /* ++ * We don't bother with any locking. Cache must not be used at this ++ * point. ++ */ ++ list_for_each_entry_safe(entry, next, &cache->c_lru_list, e_lru_list) { ++ if (!hlist_bl_unhashed(&entry->e_hash_list)) { ++ hlist_bl_del_init(&entry->e_hash_list); ++ atomic_dec(&entry->e_refcnt); ++ } else ++ WARN_ON(1); ++ list_del(&entry->e_lru_list); ++ WARN_ON(atomic_read(&entry->e_refcnt) != 1); ++ mb2_cache_entry_put(cache, entry); ++ } ++ kfree(cache->c_hash); ++ kfree(cache); ++ module_put(THIS_MODULE); ++} ++EXPORT_SYMBOL(mb2_cache_destroy); ++ ++static int __init mb2cache_init(void) ++{ ++ mb2_entry_cache = kmem_cache_create("mbcache", ++ sizeof(struct mb2_cache_entry), 0, ++ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); ++ BUG_ON(!mb2_entry_cache); ++ return 0; ++} ++ ++static void __exit mb2cache_exit(void) ++{ ++ kmem_cache_destroy(mb2_entry_cache); ++} ++ ++module_init(mb2cache_init) ++module_exit(mb2cache_exit) ++ ++MODULE_AUTHOR("Jan Kara "); ++MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); ++MODULE_LICENSE("GPL"); +diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h +new file mode 100644 +index 000000000000..2a58c51c3a0a +--- /dev/null ++++ b/include/linux/mbcache2.h +@@ -0,0 +1,54 @@ ++#ifndef _LINUX_MB2CACHE_H ++#define _LINUX_MB2CACHE_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++struct mb2_cache; ++ ++struct mb2_cache_entry { ++ /* LRU list - protected by cache->c_lru_list_lock */ ++ struct list_head e_lru_list; ++ /* Hash table list - protected by bitlock in e_hash_list_head */ ++ struct hlist_bl_node e_hash_list; ++ atomic_t e_refcnt; ++ /* Key in hash - stable during lifetime of the entry */ ++ unsigned int e_key; ++ /* Block number of hashed block - stable during lifetime of the entry */ ++ sector_t e_block; ++ /* Head of hash list (for list bit lock) - stable */ ++ struct hlist_bl_head *e_hash_list_head; ++}; ++ ++struct mb2_cache *mb2_cache_create(int bucket_bits); ++void mb2_cache_destroy(struct mb2_cache *cache); ++ ++struct mb2_cache_entry *mb2_cache_entry_create(struct mb2_cache *cache, ++ gfp_t mask, ++ unsigned int key, ++ sector_t block); ++void mb2_cache_entry_delete(struct mb2_cache *cache, ++ struct 
mb2_cache_entry *entry); ++void __mb2_cache_entry_free(struct mb2_cache_entry *entry); ++static inline int mb2_cache_entry_put(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry) ++{ ++ if (!atomic_dec_and_test(&entry->e_refcnt)) ++ return 0; ++ __mb2_cache_entry_free(entry); ++ return 1; ++} ++ ++void mb2_cache_entry_delete_block(struct mb2_cache *cache, unsigned int key, ++ sector_t block); ++struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache, ++ unsigned int key); ++struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry); ++void mb2_cache_entry_touch(struct mb2_cache *cache, ++ struct mb2_cache_entry *entry); ++ ++#endif /* _LINUX_MB2CACHE_H */ +-- +2.1.4 + + diff --git a/remove-mbcache b/remove-mbcache new file mode 100644 index 00000000..18cbd236 --- /dev/null +++ b/remove-mbcache @@ -0,0 +1,959 @@ +mbcache: remove + +From: Jan Kara + +Both ext2 and ext4 are now converted to mbcache2. Remove the old mbcache +code. + +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/Makefile | 2 +- + fs/mbcache.c | 858 ------------------------------------------------ + include/linux/mbcache.h | 55 ---- + 3 files changed, 1 insertion(+), 914 deletions(-) + delete mode 100644 fs/mbcache.c + delete mode 100644 include/linux/mbcache.h + +diff --git a/fs/Makefile b/fs/Makefile +index 15b3d6c4e46a..59b844007fbc 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o + obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o + obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o + +-obj-$(CONFIG_FS_MBCACHE) += mbcache.o mbcache2.o ++obj-$(CONFIG_FS_MBCACHE) += mbcache2.o + obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o + obj-$(CONFIG_NFS_COMMON) += nfs_common/ + obj-$(CONFIG_COREDUMP) += coredump.o +diff --git a/fs/mbcache.c b/fs/mbcache.c +deleted file mode 100644 +index 187477ded6b3..000000000000 +--- a/fs/mbcache.c ++++ /dev/null +@@ -1,858 +0,0 @@ +-/* +- * linux/fs/mbcache.c +- * (C) 2001-2002 Andreas Gruenbacher, +- */ +- +-/* +- * Filesystem Meta Information Block Cache (mbcache) +- * +- * The mbcache caches blocks of block devices that need to be located +- * by their device/block number, as well as by other criteria (such +- * as the block's contents). +- * +- * There can only be one cache entry in a cache per device and block number. +- * Additional indexes need not be unique in this sense. The number of +- * additional indexes (=other criteria) can be hardwired at compile time +- * or specified at cache create time. +- * +- * Each cache entry is of fixed size. An entry may be `valid' or `invalid' +- * in the cache. A valid entry is in the main hash tables of the cache, +- * and may also be in the lru list. An invalid entry is not in any hashes +- * or lists. +- * +- * A valid cache entry is only in the lru list if no handles refer to it. +- * Invalid cache entries will be freed when the last handle to the cache +- * entry is released. Entries that cannot be freed immediately are put +- * back on the lru list. +- */ +- +-/* +- * Lock descriptions and usage: +- * +- * Each hash chain of both the block and index hash tables now contains +- * a built-in lock used to serialize accesses to the hash chain. +- * +- * Accesses to global data structures mb_cache_list and mb_cache_lru_list +- * are serialized via the global spinlock mb_cache_spinlock. 
+- * +- * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize +- * accesses to its local data, such as e_used and e_queued. +- * +- * Lock ordering: +- * +- * Each block hash chain's lock has the highest lock order, followed by an +- * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's +- * lock), and mb_cach_spinlock, with the lowest order. While holding +- * either a block or index hash chain lock, a thread can acquire an +- * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. +- * +- * Synchronization: +- * +- * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and +- * index hash chian, it needs to lock the corresponding hash chain. For each +- * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to +- * prevent either any simultaneous release or free on the entry and also +- * to serialize accesses to either the e_used or e_queued member of the entry. +- * +- * To avoid having a dangling reference to an already freed +- * mb_cache_entry, an mb_cache_entry is only freed when it is not on a +- * block hash chain and also no longer being referenced, both e_used, +- * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is +- * first removed from a block hash chain. +- */ +- +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef MB_CACHE_DEBUG +-# define mb_debug(f...) do { \ +- printk(KERN_DEBUG f); \ +- printk("\n"); \ +- } while (0) +-#define mb_assert(c) do { if (!(c)) \ +- printk(KERN_ERR "assertion " #c " failed\n"); \ +- } while(0) +-#else +-# define mb_debug(f...) do { } while(0) +-# define mb_assert(c) do { } while(0) +-#endif +-#define mb_error(f...) do { \ +- printk(KERN_ERR f); \ +- printk("\n"); \ +- } while(0) +- +-#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) +- +-#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) +-#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ +- (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) +- +-static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); +-static struct blockgroup_lock *mb_cache_bg_lock; +-static struct kmem_cache *mb_cache_kmem_cache; +- +-MODULE_AUTHOR("Andreas Gruenbacher "); +-MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); +-MODULE_LICENSE("GPL"); +- +-EXPORT_SYMBOL(mb_cache_create); +-EXPORT_SYMBOL(mb_cache_shrink); +-EXPORT_SYMBOL(mb_cache_destroy); +-EXPORT_SYMBOL(mb_cache_entry_alloc); +-EXPORT_SYMBOL(mb_cache_entry_insert); +-EXPORT_SYMBOL(mb_cache_entry_release); +-EXPORT_SYMBOL(mb_cache_entry_free); +-EXPORT_SYMBOL(mb_cache_entry_get); +-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) +-EXPORT_SYMBOL(mb_cache_entry_find_first); +-EXPORT_SYMBOL(mb_cache_entry_find_next); +-#endif +- +-/* +- * Global data: list of all mbcache's, lru list, and a spinlock for +- * accessing cache data structures on SMP machines. The lru list is +- * global across all mbcaches. 
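
[Editor's sketch] The comment above describes the old design's single LRU list and spinlock shared by every cache instance. A stripped-down illustration of that pattern, with demo_* names standing in for the real identifiers, is:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	/* One list and one lock shared by all caches, as in the code being
	 * removed. */
	static LIST_HEAD(demo_lru_list);
	static DEFINE_SPINLOCK(demo_lru_lock);

	/* Put an entry (identified by its lru list_head) back on the global
	 * LRU if it is not already there. */
	static void demo_lru_add(struct list_head *e_lru_list)
	{
		spin_lock(&demo_lru_lock);
		if (list_empty(e_lru_list))
			list_add_tail(e_lru_list, &demo_lru_list);
		spin_unlock(&demo_lru_lock);
	}
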
+- */ +- +-static LIST_HEAD(mb_cache_list); +-static LIST_HEAD(mb_cache_lru_list); +-static DEFINE_SPINLOCK(mb_cache_spinlock); +- +-static inline void +-__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) +-{ +- spin_lock(bgl_lock_ptr(mb_cache_bg_lock, +- MB_CACHE_ENTRY_LOCK_INDEX(ce))); +-} +- +-static inline void +-__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) +-{ +- spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, +- MB_CACHE_ENTRY_LOCK_INDEX(ce))); +-} +- +-static inline int +-__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) +-{ +- return !hlist_bl_unhashed(&ce->e_block_list); +-} +- +- +-static inline void +-__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) +-{ +- if (__mb_cache_entry_is_block_hashed(ce)) +- hlist_bl_del_init(&ce->e_block_list); +-} +- +-static inline int +-__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) +-{ +- return !hlist_bl_unhashed(&ce->e_index.o_list); +-} +- +-static inline void +-__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) +-{ +- if (__mb_cache_entry_is_index_hashed(ce)) +- hlist_bl_del_init(&ce->e_index.o_list); +-} +- +-/* +- * __mb_cache_entry_unhash_unlock() +- * +- * This function is called to unhash both the block and index hash +- * chain. +- * It assumes both the block and index hash chain is locked upon entry. +- * It also unlock both hash chains both exit +- */ +-static inline void +-__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) +-{ +- __mb_cache_entry_unhash_index(ce); +- hlist_bl_unlock(ce->e_index_hash_p); +- __mb_cache_entry_unhash_block(ce); +- hlist_bl_unlock(ce->e_block_hash_p); +-} +- +-static void +-__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) +-{ +- struct mb_cache *cache = ce->e_cache; +- +- mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); +- kmem_cache_free(cache->c_entry_cache, ce); +- atomic_dec(&cache->c_entry_count); +-} +- +-static void +-__mb_cache_entry_release(struct mb_cache_entry *ce) +-{ +- /* First lock the entry to serialize access to its local data. */ +- __spin_lock_mb_cache_entry(ce); +- /* Wake up all processes queuing for this cache entry. */ +- if (ce->e_queued) +- wake_up_all(&mb_cache_queue); +- if (ce->e_used >= MB_CACHE_WRITER) +- ce->e_used -= MB_CACHE_WRITER; +- /* +- * Make sure that all cache entries on lru_list have +- * both e_used and e_qued of 0s. +- */ +- ce->e_used--; +- if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { +- if (!__mb_cache_entry_is_block_hashed(ce)) { +- __spin_unlock_mb_cache_entry(ce); +- goto forget; +- } +- /* +- * Need access to lru list, first drop entry lock, +- * then reacquire the lock in the proper order. +- */ +- spin_lock(&mb_cache_spinlock); +- if (list_empty(&ce->e_lru_list)) +- list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); +- spin_unlock(&mb_cache_spinlock); +- } +- __spin_unlock_mb_cache_entry(ce); +- return; +-forget: +- mb_assert(list_empty(&ce->e_lru_list)); +- __mb_cache_entry_forget(ce, GFP_KERNEL); +-} +- +-/* +- * mb_cache_shrink_scan() memory pressure callback +- * +- * This function is called by the kernel memory management when memory +- * gets low. +- * +- * @shrink: (ignored) +- * @sc: shrink_control passed from reclaim +- * +- * Returns the number of objects freed. 
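
[Editor's sketch] Both the old code (mb_cache_shrinker below) and the new mb2 code register a shrinker with a count callback and a scan callback. The general wiring looks like the following; demo_count()/demo_scan() are placeholders for the real accounting, not code from the patch:

	#include <linux/mm.h>
	#include <linux/shrinker.h>

	static unsigned long demo_count(struct shrinker *shrink,
					struct shrink_control *sc)
	{
		return 0;	/* number of reclaimable entries */
	}

	static unsigned long demo_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
	{
		return 0;	/* entries actually freed, at most sc->nr_to_scan */
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
	};
	/* register_shrinker(&demo_shrinker) at init,
	 * unregister_shrinker(&demo_shrinker) on exit. */
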
+- */ +-static unsigned long +-mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +-{ +- LIST_HEAD(free_list); +- struct mb_cache_entry *entry, *tmp; +- int nr_to_scan = sc->nr_to_scan; +- gfp_t gfp_mask = sc->gfp_mask; +- unsigned long freed = 0; +- +- mb_debug("trying to free %d entries", nr_to_scan); +- spin_lock(&mb_cache_spinlock); +- while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { +- struct mb_cache_entry *ce = +- list_entry(mb_cache_lru_list.next, +- struct mb_cache_entry, e_lru_list); +- list_del_init(&ce->e_lru_list); +- if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) +- continue; +- spin_unlock(&mb_cache_spinlock); +- /* Prevent any find or get operation on the entry */ +- hlist_bl_lock(ce->e_block_hash_p); +- hlist_bl_lock(ce->e_index_hash_p); +- /* Ignore if it is touched by a find/get */ +- if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || +- !list_empty(&ce->e_lru_list)) { +- hlist_bl_unlock(ce->e_index_hash_p); +- hlist_bl_unlock(ce->e_block_hash_p); +- spin_lock(&mb_cache_spinlock); +- continue; +- } +- __mb_cache_entry_unhash_unlock(ce); +- list_add_tail(&ce->e_lru_list, &free_list); +- spin_lock(&mb_cache_spinlock); +- } +- spin_unlock(&mb_cache_spinlock); +- +- list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { +- __mb_cache_entry_forget(entry, gfp_mask); +- freed++; +- } +- return freed; +-} +- +-static unsigned long +-mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +-{ +- struct mb_cache *cache; +- unsigned long count = 0; +- +- spin_lock(&mb_cache_spinlock); +- list_for_each_entry(cache, &mb_cache_list, c_cache_list) { +- mb_debug("cache %s (%d)", cache->c_name, +- atomic_read(&cache->c_entry_count)); +- count += atomic_read(&cache->c_entry_count); +- } +- spin_unlock(&mb_cache_spinlock); +- +- return vfs_pressure_ratio(count); +-} +- +-static struct shrinker mb_cache_shrinker = { +- .count_objects = mb_cache_shrink_count, +- .scan_objects = mb_cache_shrink_scan, +- .seeks = DEFAULT_SEEKS, +-}; +- +-/* +- * mb_cache_create() create a new cache +- * +- * All entries in one cache are equal size. Cache entries may be from +- * multiple devices. If this is the first mbcache created, registers +- * the cache with kernel memory management. Returns NULL if no more +- * memory was available. 
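
[Editor's sketch] In both the old and the new cache, bucket_bits is log2 of the hash-table size, i.e. the table has 1 << bucket_bits hlist_bl heads. A minimal sketch of that allocation pattern (the patch itself uses a plain kmalloc() of bucket_count * sizeof(head); kmalloc_array() here is an equivalent, and demo_alloc_buckets() is not from the patch):

	#include <linux/list_bl.h>
	#include <linux/slab.h>

	static struct hlist_bl_head *demo_alloc_buckets(int bucket_bits)
	{
		int i, count = 1 << bucket_bits;
		struct hlist_bl_head *hash;

		hash = kmalloc_array(count, sizeof(*hash), GFP_KERNEL);
		if (!hash)
			return NULL;
		for (i = 0; i < count; i++)
			INIT_HLIST_BL_HEAD(&hash[i]);
		return hash;
	}
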
+- * +- * @name: name of the cache (informal) +- * @bucket_bits: log2(number of hash buckets) +- */ +-struct mb_cache * +-mb_cache_create(const char *name, int bucket_bits) +-{ +- int n, bucket_count = 1 << bucket_bits; +- struct mb_cache *cache = NULL; +- +- if (!mb_cache_bg_lock) { +- mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), +- GFP_KERNEL); +- if (!mb_cache_bg_lock) +- return NULL; +- bgl_lock_init(mb_cache_bg_lock); +- } +- +- cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); +- if (!cache) +- return NULL; +- cache->c_name = name; +- atomic_set(&cache->c_entry_count, 0); +- cache->c_bucket_bits = bucket_bits; +- cache->c_block_hash = kmalloc(bucket_count * +- sizeof(struct hlist_bl_head), GFP_KERNEL); +- if (!cache->c_block_hash) +- goto fail; +- for (n=0; nc_block_hash[n]); +- cache->c_index_hash = kmalloc(bucket_count * +- sizeof(struct hlist_bl_head), GFP_KERNEL); +- if (!cache->c_index_hash) +- goto fail; +- for (n=0; nc_index_hash[n]); +- if (!mb_cache_kmem_cache) { +- mb_cache_kmem_cache = kmem_cache_create(name, +- sizeof(struct mb_cache_entry), 0, +- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); +- if (!mb_cache_kmem_cache) +- goto fail2; +- } +- cache->c_entry_cache = mb_cache_kmem_cache; +- +- /* +- * Set an upper limit on the number of cache entries so that the hash +- * chains won't grow too long. +- */ +- cache->c_max_entries = bucket_count << 4; +- +- spin_lock(&mb_cache_spinlock); +- list_add(&cache->c_cache_list, &mb_cache_list); +- spin_unlock(&mb_cache_spinlock); +- return cache; +- +-fail2: +- kfree(cache->c_index_hash); +- +-fail: +- kfree(cache->c_block_hash); +- kfree(cache); +- return NULL; +-} +- +- +-/* +- * mb_cache_shrink() +- * +- * Removes all cache entries of a device from the cache. All cache entries +- * currently in use cannot be freed, and thus remain in the cache. All others +- * are freed. +- * +- * @bdev: which device's cache entries to shrink +- */ +-void +-mb_cache_shrink(struct block_device *bdev) +-{ +- LIST_HEAD(free_list); +- struct list_head *l; +- struct mb_cache_entry *ce, *tmp; +- +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- while (!list_is_last(l, &mb_cache_lru_list)) { +- l = l->next; +- ce = list_entry(l, struct mb_cache_entry, e_lru_list); +- if (ce->e_bdev == bdev) { +- list_del_init(&ce->e_lru_list); +- if (ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt)) +- continue; +- spin_unlock(&mb_cache_spinlock); +- /* +- * Prevent any find or get operation on the entry. +- */ +- hlist_bl_lock(ce->e_block_hash_p); +- hlist_bl_lock(ce->e_index_hash_p); +- /* Ignore if it is touched by a find/get */ +- if (ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt) || +- !list_empty(&ce->e_lru_list)) { +- hlist_bl_unlock(ce->e_index_hash_p); +- hlist_bl_unlock(ce->e_block_hash_p); +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- continue; +- } +- __mb_cache_entry_unhash_unlock(ce); +- mb_assert(!(ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt))); +- list_add_tail(&ce->e_lru_list, &free_list); +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- } +- } +- spin_unlock(&mb_cache_spinlock); +- +- list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { +- __mb_cache_entry_forget(ce, GFP_KERNEL); +- } +-} +- +- +-/* +- * mb_cache_destroy() +- * +- * Shrinks the cache to its minimum possible size (hopefully 0 entries), +- * and then destroys it. If this was the last mbcache, un-registers the +- * mbcache from kernel memory management. 
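
[Editor's sketch] mb_cache_shrink() above and mb_cache_destroy() below both use the same "collect then free" idiom: matching entries are moved to a private list while the lock is held, and only freed once the lock is dropped. A self-contained illustration of the idiom (demo_entry/demo_evict_owner are stand-ins, not patch code):

	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct demo_entry {
		struct list_head lru;
		void *owner;
	};

	static void demo_evict_owner(struct list_head *lru_list,
				     spinlock_t *lock, void *owner)
	{
		LIST_HEAD(free_list);
		struct demo_entry *e, *tmp;

		/* Move matching entries to a private list under the lock... */
		spin_lock(lock);
		list_for_each_entry_safe(e, tmp, lru_list, lru)
			if (e->owner == owner)
				list_move_tail(&e->lru, &free_list);
		spin_unlock(lock);

		/* ...then free them with the lock dropped. */
		list_for_each_entry_safe(e, tmp, &free_list, lru) {
			list_del(&e->lru);
			kfree(e);
		}
	}
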
+- */ +-void +-mb_cache_destroy(struct mb_cache *cache) +-{ +- LIST_HEAD(free_list); +- struct mb_cache_entry *ce, *tmp; +- +- spin_lock(&mb_cache_spinlock); +- list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { +- if (ce->e_cache == cache) +- list_move_tail(&ce->e_lru_list, &free_list); +- } +- list_del(&cache->c_cache_list); +- spin_unlock(&mb_cache_spinlock); +- +- list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { +- list_del_init(&ce->e_lru_list); +- /* +- * Prevent any find or get operation on the entry. +- */ +- hlist_bl_lock(ce->e_block_hash_p); +- hlist_bl_lock(ce->e_index_hash_p); +- mb_assert(!(ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt))); +- __mb_cache_entry_unhash_unlock(ce); +- __mb_cache_entry_forget(ce, GFP_KERNEL); +- } +- +- if (atomic_read(&cache->c_entry_count) > 0) { +- mb_error("cache %s: %d orphaned entries", +- cache->c_name, +- atomic_read(&cache->c_entry_count)); +- } +- +- if (list_empty(&mb_cache_list)) { +- kmem_cache_destroy(mb_cache_kmem_cache); +- mb_cache_kmem_cache = NULL; +- } +- kfree(cache->c_index_hash); +- kfree(cache->c_block_hash); +- kfree(cache); +-} +- +-/* +- * mb_cache_entry_alloc() +- * +- * Allocates a new cache entry. The new entry will not be valid initially, +- * and thus cannot be looked up yet. It should be filled with data, and +- * then inserted into the cache using mb_cache_entry_insert(). Returns NULL +- * if no more memory was available. +- */ +-struct mb_cache_entry * +-mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) +-{ +- struct mb_cache_entry *ce; +- +- if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { +- struct list_head *l; +- +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- while (!list_is_last(l, &mb_cache_lru_list)) { +- l = l->next; +- ce = list_entry(l, struct mb_cache_entry, e_lru_list); +- if (ce->e_cache == cache) { +- list_del_init(&ce->e_lru_list); +- if (ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt)) +- continue; +- spin_unlock(&mb_cache_spinlock); +- /* +- * Prevent any find or get operation on the +- * entry. +- */ +- hlist_bl_lock(ce->e_block_hash_p); +- hlist_bl_lock(ce->e_index_hash_p); +- /* Ignore if it is touched by a find/get */ +- if (ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt) || +- !list_empty(&ce->e_lru_list)) { +- hlist_bl_unlock(ce->e_index_hash_p); +- hlist_bl_unlock(ce->e_block_hash_p); +- l = &mb_cache_lru_list; +- spin_lock(&mb_cache_spinlock); +- continue; +- } +- mb_assert(list_empty(&ce->e_lru_list)); +- mb_assert(!(ce->e_used || ce->e_queued || +- atomic_read(&ce->e_refcnt))); +- __mb_cache_entry_unhash_unlock(ce); +- goto found; +- } +- } +- spin_unlock(&mb_cache_spinlock); +- } +- +- ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); +- if (!ce) +- return NULL; +- atomic_inc(&cache->c_entry_count); +- INIT_LIST_HEAD(&ce->e_lru_list); +- INIT_HLIST_BL_NODE(&ce->e_block_list); +- INIT_HLIST_BL_NODE(&ce->e_index.o_list); +- ce->e_cache = cache; +- ce->e_queued = 0; +- atomic_set(&ce->e_refcnt, 0); +-found: +- ce->e_block_hash_p = &cache->c_block_hash[0]; +- ce->e_index_hash_p = &cache->c_index_hash[0]; +- ce->e_used = 1 + MB_CACHE_WRITER; +- return ce; +-} +- +- +-/* +- * mb_cache_entry_insert() +- * +- * Inserts an entry that was allocated using mb_cache_entry_alloc() into +- * the cache. After this, the cache entry can be looked up, but is not yet +- * in the lru list as the caller still holds a handle to it. 
Returns 0 on +- * success, or -EBUSY if a cache entry for that device + inode exists +- * already (this may happen after a failed lookup, but when another process +- * has inserted the same cache entry in the meantime). +- * +- * @bdev: device the cache entry belongs to +- * @block: block number +- * @key: lookup key +- */ +-int +-mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, +- sector_t block, unsigned int key) +-{ +- struct mb_cache *cache = ce->e_cache; +- unsigned int bucket; +- struct hlist_bl_node *l; +- struct hlist_bl_head *block_hash_p; +- struct hlist_bl_head *index_hash_p; +- struct mb_cache_entry *lce; +- +- mb_assert(ce); +- bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), +- cache->c_bucket_bits); +- block_hash_p = &cache->c_block_hash[bucket]; +- hlist_bl_lock(block_hash_p); +- hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { +- if (lce->e_bdev == bdev && lce->e_block == block) { +- hlist_bl_unlock(block_hash_p); +- return -EBUSY; +- } +- } +- mb_assert(!__mb_cache_entry_is_block_hashed(ce)); +- __mb_cache_entry_unhash_block(ce); +- __mb_cache_entry_unhash_index(ce); +- ce->e_bdev = bdev; +- ce->e_block = block; +- ce->e_block_hash_p = block_hash_p; +- ce->e_index.o_key = key; +- hlist_bl_add_head(&ce->e_block_list, block_hash_p); +- hlist_bl_unlock(block_hash_p); +- bucket = hash_long(key, cache->c_bucket_bits); +- index_hash_p = &cache->c_index_hash[bucket]; +- hlist_bl_lock(index_hash_p); +- ce->e_index_hash_p = index_hash_p; +- hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); +- hlist_bl_unlock(index_hash_p); +- return 0; +-} +- +- +-/* +- * mb_cache_entry_release() +- * +- * Release a handle to a cache entry. When the last handle to a cache entry +- * is released it is either freed (if it is invalid) or otherwise inserted +- * in to the lru list. +- */ +-void +-mb_cache_entry_release(struct mb_cache_entry *ce) +-{ +- __mb_cache_entry_release(ce); +-} +- +- +-/* +- * mb_cache_entry_free() +- * +- */ +-void +-mb_cache_entry_free(struct mb_cache_entry *ce) +-{ +- mb_assert(ce); +- mb_assert(list_empty(&ce->e_lru_list)); +- hlist_bl_lock(ce->e_index_hash_p); +- __mb_cache_entry_unhash_index(ce); +- hlist_bl_unlock(ce->e_index_hash_p); +- hlist_bl_lock(ce->e_block_hash_p); +- __mb_cache_entry_unhash_block(ce); +- hlist_bl_unlock(ce->e_block_hash_p); +- __mb_cache_entry_release(ce); +-} +- +- +-/* +- * mb_cache_entry_get() +- * +- * Get a cache entry by device / block number. (There can only be one entry +- * in the cache per device and block.) Returns NULL if no such cache entry +- * exists. The returned cache entry is locked for exclusive access ("single +- * writer"). +- */ +-struct mb_cache_entry * +-mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, +- sector_t block) +-{ +- unsigned int bucket; +- struct hlist_bl_node *l; +- struct mb_cache_entry *ce; +- struct hlist_bl_head *block_hash_p; +- +- bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), +- cache->c_bucket_bits); +- block_hash_p = &cache->c_block_hash[bucket]; +- /* First serialize access to the block corresponding hash chain. */ +- hlist_bl_lock(block_hash_p); +- hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { +- mb_assert(ce->e_block_hash_p == block_hash_p); +- if (ce->e_bdev == bdev && ce->e_block == block) { +- /* +- * Prevent a free from removing the entry. 
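
[Editor's sketch] The refcount bump here is the standard "take a reference while the locked hash chain still pins the object" pattern; the new mb2_cache_entry_put() in mbcache2.h implements the release side with atomic_dec_and_test(). A generic, hypothetical version of the pair (demo_* names are not from the patch):

	#include <linux/atomic.h>
	#include <linux/slab.h>

	struct demo_obj {
		atomic_t refcnt;	/* starts at 1 for the creator */
	};

	/* Take a reference; the caller must already hold something (such as
	 * a locked hash chain) that keeps the object alive. */
	static void demo_get(struct demo_obj *obj)
	{
		atomic_inc(&obj->refcnt);
	}

	/* Drop a reference; the last put frees the object. */
	static void demo_put(struct demo_obj *obj)
	{
		if (atomic_dec_and_test(&obj->refcnt))
			kfree(obj);
	}
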
+- */ +- atomic_inc(&ce->e_refcnt); +- hlist_bl_unlock(block_hash_p); +- __spin_lock_mb_cache_entry(ce); +- atomic_dec(&ce->e_refcnt); +- if (ce->e_used > 0) { +- DEFINE_WAIT(wait); +- while (ce->e_used > 0) { +- ce->e_queued++; +- prepare_to_wait(&mb_cache_queue, &wait, +- TASK_UNINTERRUPTIBLE); +- __spin_unlock_mb_cache_entry(ce); +- schedule(); +- __spin_lock_mb_cache_entry(ce); +- ce->e_queued--; +- } +- finish_wait(&mb_cache_queue, &wait); +- } +- ce->e_used += 1 + MB_CACHE_WRITER; +- __spin_unlock_mb_cache_entry(ce); +- +- if (!list_empty(&ce->e_lru_list)) { +- spin_lock(&mb_cache_spinlock); +- list_del_init(&ce->e_lru_list); +- spin_unlock(&mb_cache_spinlock); +- } +- if (!__mb_cache_entry_is_block_hashed(ce)) { +- __mb_cache_entry_release(ce); +- return NULL; +- } +- return ce; +- } +- } +- hlist_bl_unlock(block_hash_p); +- return NULL; +-} +- +-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) +- +-static struct mb_cache_entry * +-__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, +- struct block_device *bdev, unsigned int key) +-{ +- +- /* The index hash chain is alredy acquire by caller. */ +- while (l != NULL) { +- struct mb_cache_entry *ce = +- hlist_bl_entry(l, struct mb_cache_entry, +- e_index.o_list); +- mb_assert(ce->e_index_hash_p == head); +- if (ce->e_bdev == bdev && ce->e_index.o_key == key) { +- /* +- * Prevent a free from removing the entry. +- */ +- atomic_inc(&ce->e_refcnt); +- hlist_bl_unlock(head); +- __spin_lock_mb_cache_entry(ce); +- atomic_dec(&ce->e_refcnt); +- ce->e_used++; +- /* Incrementing before holding the lock gives readers +- priority over writers. */ +- if (ce->e_used >= MB_CACHE_WRITER) { +- DEFINE_WAIT(wait); +- +- while (ce->e_used >= MB_CACHE_WRITER) { +- ce->e_queued++; +- prepare_to_wait(&mb_cache_queue, &wait, +- TASK_UNINTERRUPTIBLE); +- __spin_unlock_mb_cache_entry(ce); +- schedule(); +- __spin_lock_mb_cache_entry(ce); +- ce->e_queued--; +- } +- finish_wait(&mb_cache_queue, &wait); +- } +- __spin_unlock_mb_cache_entry(ce); +- if (!list_empty(&ce->e_lru_list)) { +- spin_lock(&mb_cache_spinlock); +- list_del_init(&ce->e_lru_list); +- spin_unlock(&mb_cache_spinlock); +- } +- if (!__mb_cache_entry_is_block_hashed(ce)) { +- __mb_cache_entry_release(ce); +- return ERR_PTR(-EAGAIN); +- } +- return ce; +- } +- l = l->next; +- } +- hlist_bl_unlock(head); +- return NULL; +-} +- +- +-/* +- * mb_cache_entry_find_first() +- * +- * Find the first cache entry on a given device with a certain key in +- * an additional index. Additional matches can be found with +- * mb_cache_entry_find_next(). Returns NULL if no match was found. The +- * returned cache entry is locked for shared access ("multiple readers"). 
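
[Editor's sketch] The lookup side is the usual find_first/find_next loop. With the new interface from mbcache2.h it could look like the following; demo_lookup() and its match callback are hypothetical, and the reference semantics (find_next releasing the previous entry, the caller putting the entry it stops at) are assumed to mirror the behaviour the old find_next documents below:

	#include <linux/types.h>
	#include <linux/mbcache2.h>

	static sector_t demo_lookup(struct mb2_cache *cache, unsigned int key,
				    bool (*match)(sector_t block))
	{
		struct mb2_cache_entry *entry;

		entry = mb2_cache_entry_find_first(cache, key);
		while (entry) {
			if (match(entry->e_block)) {
				sector_t block = entry->e_block;

				mb2_cache_entry_touch(cache, entry);
				mb2_cache_entry_put(cache, entry);
				return block;
			}
			entry = mb2_cache_entry_find_next(cache, entry);
		}
		return 0;
	}
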
+- * +- * @cache: the cache to search +- * @bdev: the device the cache entry should belong to +- * @key: the key in the index +- */ +-struct mb_cache_entry * +-mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, +- unsigned int key) +-{ +- unsigned int bucket = hash_long(key, cache->c_bucket_bits); +- struct hlist_bl_node *l; +- struct mb_cache_entry *ce = NULL; +- struct hlist_bl_head *index_hash_p; +- +- index_hash_p = &cache->c_index_hash[bucket]; +- hlist_bl_lock(index_hash_p); +- if (!hlist_bl_empty(index_hash_p)) { +- l = hlist_bl_first(index_hash_p); +- ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); +- } else +- hlist_bl_unlock(index_hash_p); +- return ce; +-} +- +- +-/* +- * mb_cache_entry_find_next() +- * +- * Find the next cache entry on a given device with a certain key in an +- * additional index. Returns NULL if no match could be found. The previous +- * entry is atomatically released, so that mb_cache_entry_find_next() can +- * be called like this: +- * +- * entry = mb_cache_entry_find_first(); +- * while (entry) { +- * ... +- * entry = mb_cache_entry_find_next(entry, ...); +- * } +- * +- * @prev: The previous match +- * @bdev: the device the cache entry should belong to +- * @key: the key in the index +- */ +-struct mb_cache_entry * +-mb_cache_entry_find_next(struct mb_cache_entry *prev, +- struct block_device *bdev, unsigned int key) +-{ +- struct mb_cache *cache = prev->e_cache; +- unsigned int bucket = hash_long(key, cache->c_bucket_bits); +- struct hlist_bl_node *l; +- struct mb_cache_entry *ce; +- struct hlist_bl_head *index_hash_p; +- +- index_hash_p = &cache->c_index_hash[bucket]; +- mb_assert(prev->e_index_hash_p == index_hash_p); +- hlist_bl_lock(index_hash_p); +- mb_assert(!hlist_bl_empty(index_hash_p)); +- l = prev->e_index.o_list.next; +- ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); +- __mb_cache_entry_release(prev); +- return ce; +-} +- +-#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ +- +-static int __init init_mbcache(void) +-{ +- register_shrinker(&mb_cache_shrinker); +- return 0; +-} +- +-static void __exit exit_mbcache(void) +-{ +- unregister_shrinker(&mb_cache_shrinker); +-} +- +-module_init(init_mbcache) +-module_exit(exit_mbcache) +- +diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h +deleted file mode 100644 +index 6a392e7a723a..000000000000 +--- a/include/linux/mbcache.h ++++ /dev/null +@@ -1,55 +0,0 @@ +-/* +- File: linux/mbcache.h +- +- (C) 2001 by Andreas Gruenbacher, +-*/ +-struct mb_cache_entry { +- struct list_head e_lru_list; +- struct mb_cache *e_cache; +- unsigned short e_used; +- unsigned short e_queued; +- atomic_t e_refcnt; +- struct block_device *e_bdev; +- sector_t e_block; +- struct hlist_bl_node e_block_list; +- struct { +- struct hlist_bl_node o_list; +- unsigned int o_key; +- } e_index; +- struct hlist_bl_head *e_block_hash_p; +- struct hlist_bl_head *e_index_hash_p; +-}; +- +-struct mb_cache { +- struct list_head c_cache_list; +- const char *c_name; +- atomic_t c_entry_count; +- int c_max_entries; +- int c_bucket_bits; +- struct kmem_cache *c_entry_cache; +- struct hlist_bl_head *c_block_hash; +- struct hlist_bl_head *c_index_hash; +-}; +- +-/* Functions on caches */ +- +-struct mb_cache *mb_cache_create(const char *, int); +-void mb_cache_shrink(struct block_device *); +-void mb_cache_destroy(struct mb_cache *); +- +-/* Functions on cache entries */ +- +-struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *, gfp_t); +-int 
mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *, +- sector_t, unsigned int); +-void mb_cache_entry_release(struct mb_cache_entry *); +-void mb_cache_entry_free(struct mb_cache_entry *); +-struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, +- struct block_device *, +- sector_t); +-struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, +- struct block_device *, +- unsigned int); +-struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, +- struct block_device *, +- unsigned int); +-- +2.1.4 + + diff --git a/series b/series index c17b05e6..331afd34 100644 --- a/series +++ b/series @@ -12,6 +12,15 @@ fix-memleak-in-ext4_readdir fix-bh-b_state-corruption fix-crashes-in-dioread_nolock-mode +# Above has been pushed to Linus + +reimplement-mbcache +ext4-convert-to-mbcache2 +ext2-convert-to-mbcache2 +remove-mbcache +mbcache2-limit-cache-size +mbcache2-use-referenced-bit-instead-of-LRU + ########################################## # unstable patches #################################################### diff --git a/timestamps b/timestamps index b2b84df8..da9417e4 100755 --- a/timestamps +++ b/timestamps @@ -38,7 +38,13 @@ touch -d @1455258180 remove-unused-parameter-newblock touch -d @1455600079 stable-boundary touch -d @1455859105 fix-bh-b_state-corruption touch -d @1455860001 fix-crashes-in-dioread_nolock-mode -touch -d @1455860008 status -touch -d @1455860037 series touch -d @1455908377 fix-memleak-in-ext4_readdir -touch -d @1455915067 timestamps +touch -d @1455915167 reimplement-mbcache +touch -d @1455915233 ext4-convert-to-mbcache2 +touch -d @1455915523 ext2-convert-to-mbcache2 +touch -d @1455915619 remove-mbcache +touch -d @1455915679 mbcache2-limit-cache-size +touch -d @1455915868 mbcache2-use-referenced-bit-instead-of-LRU +touch -d @1455915890 series +touch -d @1455915931 status +touch -d @1456033604 timestamps -- 2.11.4.GIT