1 ext4: xattr inode deduplication
3 From: Tahsin Erdogan <tahsin@google.com>
5 Ext4 now supports xattr values that are up to 64k in size (vfs limit).
6 Large xattr values are stored in external inodes each one holding a
7 single value. Once written the data blocks of these inodes are immutable.
9 The real world use cases are expected to have a lot of value duplication
10 such as inherited acls etc. To reduce data duplication on disk, this patch
11 implements a deduplicator that allows sharing of xattr inodes.
13 The deduplication is based on an in-memory hash lookup that is a best
14 effort sharing scheme. When an xattr inode is read from disk (i.e.
15 getxattr() call), its crc32c hash is added to a hash table. Before
16 creating a new xattr inode for a value being set, the hash table is
17 checked to see if an existing inode holds an identical value. If such an
18 inode is found, the ref count on that inode is incremented. On value
19 removal the ref count is decremented and if it reaches zero the inode is freed.
22 The quota charging for such inodes is manually managed. Every reference
23 holder is charged the full size as if there was no sharing happening.
24 This is consistent with how xattr blocks are also charged.
26 Signed-off-by: Tahsin Erdogan <tahsin@google.com>
27 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
30 - Fixed error message "Failed to create an s_ea_inode_cache"
32 - made ext4_meta_trans_blocks() static again since there are no
33 remaining users outside of inode.c
34 - initialize sbi->s_csum_seed when ea_inode feature is enabled
35 - use l_i_version to hold lower 32 bits of the xattr ref count.
36 This avoids clashes with old implementations which use i_mtime.
37 Since l_i_version is not available in HURD_COMPAT mode, fail mount
38 request when both ea_inode feature and HURD_COMPAT are set.
39 - when hash validation fails, fall back to old implementation
40 which has a backref to parent.
41 - fixed checkpatch.pl warning about using unsigned alone
44 - eliminated xattr entry in the xattr inode to avoid complexity and
45 recursion in xattr update path. Now the ref count and hash are stored
46 in i_[c/m/a]time.tv_sec fields.
47 - some clean up in ext4_xattr_set_entry() to reduce code duplication and complexity
51 - use s_csum_seed for hash calculations when available
52 - return error on stored vs calculated hash mismatch
55 - make dependency on crc32c dynamic
56 - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
57 they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver as the metadata_csum feature being enabled
60 fs/ext4/ext4.h | 23 +-
61 fs/ext4/inode.c | 13 +-
62 fs/ext4/super.c | 37 +-
63 fs/ext4/xattr.c | 1038 +++++++++++++++++++++++++++++++++++++++++--------------
64 fs/ext4/xattr.h | 17 +-
66 7 files changed, 848 insertions(+), 294 deletions(-)
68 diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
69 index 74f7ac539e00..8db03e5c78bc 100644
72 @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
76 - credits = ext4_xattr_set_credits(inode, acl_size);
77 + error = ext4_xattr_set_credits(inode, acl_size, &credits);
81 handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
83 return PTR_ERR(handle);
84 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
85 index 3b02bd897b61..dc06287ddec8 100644
88 @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
90 struct ext4_es_stats s_es_stats;
91 struct mb_cache *s_ea_block_cache;
92 + struct mb_cache *s_ea_inode_cache;
93 spinlock_t s_es_lock ____cacheline_aligned_in_smp;
95 /* Ratelimit ext4 messages. */
96 @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
97 return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
100 -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
101 +static inline bool ext4_is_quota_file(struct inode *inode)
103 + return IS_NOQUOTA(inode) &&
104 + !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
108 * This structure is stuffed into the struct file's private_data field
109 @@ -2482,7 +2487,6 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
110 extern void ext4_set_inode_flags(struct inode *);
111 extern int ext4_alloc_da_blocks(struct inode *inode);
112 extern void ext4_set_aops(struct inode *inode);
113 -extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
114 extern int ext4_writepage_trans_blocks(struct inode *);
115 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
116 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
117 @@ -2709,19 +2713,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
118 extern int ext4_register_li_request(struct super_block *sb,
119 ext4_group_t first_not_zeroed);
121 -static inline int ext4_has_group_desc_csum(struct super_block *sb)
123 - return ext4_has_feature_gdt_csum(sb) ||
124 - EXT4_SB(sb)->s_chksum_driver != NULL;
127 static inline int ext4_has_metadata_csum(struct super_block *sb)
129 WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
130 !EXT4_SB(sb)->s_chksum_driver);
132 - return (EXT4_SB(sb)->s_chksum_driver != NULL);
133 + return ext4_has_feature_metadata_csum(sb) &&
134 + (EXT4_SB(sb)->s_chksum_driver != NULL);
137 +static inline int ext4_has_group_desc_csum(struct super_block *sb)
139 + return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
142 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
144 return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
145 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
146 index cd007f9757d1..ea95bd9eab81 100644
147 --- a/fs/ext4/inode.c
148 +++ b/fs/ext4/inode.c
149 @@ -139,6 +139,8 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
150 unsigned int length);
151 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
152 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
153 +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
157 * Test whether an inode is a fast symlink.
158 @@ -4843,8 +4845,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
161 ext4_set_inode_flags(inode);
162 - if (ei->i_flags & EXT4_EA_INODE_FL)
164 + if (ei->i_flags & EXT4_EA_INODE_FL) {
165 ext4_xattr_inode_set_class(inode);
168 + inode->i_flags |= S_NOQUOTA;
169 + inode_unlock(inode);
172 unlock_new_inode(inode);
175 @@ -5503,7 +5512,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
177 * Also account for superblock, inode, quota and xattr blocks
179 -int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
180 +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
183 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
184 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
185 index 380389740575..d501f8256dc4 100644
186 --- a/fs/ext4/super.c
187 +++ b/fs/ext4/super.c
188 @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
189 invalidate_bdev(sbi->journal_bdev);
190 ext4_blkdev_remove(sbi);
192 + if (sbi->s_ea_inode_cache) {
193 + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
194 + sbi->s_ea_inode_cache = NULL;
196 if (sbi->s_ea_block_cache) {
197 ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
198 sbi->s_ea_block_cache = NULL;
199 @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
203 - credits = ext4_xattr_set_credits(inode, len);
204 + res = ext4_xattr_set_credits(inode, len, &credits);
208 handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
210 return PTR_ERR(handle);
211 @@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
214 /* Load the checksum driver */
215 - if (ext4_has_feature_metadata_csum(sb)) {
216 + if (ext4_has_feature_metadata_csum(sb) ||
217 + ext4_has_feature_ea_inode(sb)) {
218 sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
219 if (IS_ERR(sbi->s_chksum_driver)) {
220 ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
221 @@ -3467,7 +3475,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
222 /* Precompute checksum seed for all metadata */
223 if (ext4_has_feature_csum_seed(sb))
224 sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
225 - else if (ext4_has_metadata_csum(sb))
226 + else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
227 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
230 @@ -3597,6 +3605,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
231 "The Hurd can't support 64-bit file systems");
236 + * ea_inode feature uses l_i_version field which is not
237 + * available in HURD_COMPAT mode.
239 + if (ext4_has_feature_ea_inode(sb)) {
240 + ext4_msg(sb, KERN_ERR,
241 + "ea_inode feature is not supported for Hurd");
246 if (IS_EXT2_SB(sb)) {
247 @@ -4067,6 +4085,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
248 goto failed_mount_wq;
251 + if (ext4_has_feature_ea_inode(sb)) {
252 + sbi->s_ea_inode_cache = ext4_xattr_create_cache();
253 + if (!sbi->s_ea_inode_cache) {
254 + ext4_msg(sb, KERN_ERR,
255 + "Failed to create ea_inode_cache");
256 + goto failed_mount_wq;
260 if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
261 (blocksize != PAGE_SIZE)) {
262 ext4_msg(sb, KERN_ERR,
263 @@ -4296,6 +4323,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
264 if (EXT4_SB(sb)->rsv_conversion_wq)
265 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
267 + if (sbi->s_ea_inode_cache) {
268 + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
269 + sbi->s_ea_inode_cache = NULL;
271 if (sbi->s_ea_block_cache) {
272 ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
273 sbi->s_ea_block_cache = NULL;
274 diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
275 index 94f04b9fb421..15c9f736dcc4 100644
276 --- a/fs/ext4/xattr.c
277 +++ b/fs/ext4/xattr.c
278 @@ -108,6 +108,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
279 #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \
280 inode->i_sb->s_fs_info)->s_ea_block_cache)
282 +#define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \
283 + inode->i_sb->s_fs_info)->s_ea_inode_cache)
286 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
287 struct inode *inode);
288 @@ -280,15 +283,44 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
289 return cmp ? -ENODATA : 0;
293 +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
295 + return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
298 +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
300 + return ((u64)ea_inode->i_ctime.tv_sec << 32) |
301 + ((u32)ea_inode->i_version);
304 +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
306 + ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
307 + ea_inode->i_version = (u32)ref_count;
310 +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
312 + return (u32)ea_inode->i_atime.tv_sec;
315 +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
317 + ea_inode->i_atime.tv_sec = hash;
321 * Read the EA value from an inode.
323 static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
325 unsigned long block = 0;
326 - struct buffer_head *bh = NULL;
327 + struct buffer_head *bh;
328 int blocksize = ea_inode->i_sb->s_blocksize;
329 size_t csize, copied = 0;
330 + void *copy_pos = buf;
332 while (copied < size) {
333 csize = (size - copied) > blocksize ? blocksize : size - copied;
334 @@ -298,10 +330,10 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
336 return -EFSCORRUPTED;
338 - memcpy(buf, bh->b_data, csize);
339 + memcpy(copy_pos, bh->b_data, csize);
347 @@ -317,29 +349,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
348 inode = ext4_iget(parent->i_sb, ea_ino);
350 err = PTR_ERR(inode);
351 - ext4_error(parent->i_sb, "error while reading EA inode %lu "
352 - "err=%d", ea_ino, err);
353 + ext4_error(parent->i_sb,
354 + "error while reading EA inode %lu err=%d", ea_ino,
359 if (is_bad_inode(inode)) {
360 - ext4_error(parent->i_sb, "error while reading EA inode %lu "
361 - "is_bad_inode", ea_ino);
362 + ext4_error(parent->i_sb,
363 + "error while reading EA inode %lu is_bad_inode",
369 - if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
370 - inode->i_generation != parent->i_generation) {
371 - ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
372 - "to parent is invalid.", ea_ino);
377 if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
378 - ext4_error(parent->i_sb, "EA inode %lu does not have "
379 - "EXT4_EA_INODE_FL flag set.\n", ea_ino);
380 + ext4_error(parent->i_sb,
381 + "EA inode %lu does not have EXT4_EA_INODE_FL flag",
386 @@ -351,6 +378,20 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
391 +ext4_xattr_inode_verify_hash(struct inode *ea_inode, void *buffer, size_t size)
395 + /* Verify stored hash matches calculated hash. */
396 + hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size);
397 + if (hash != ext4_xattr_inode_get_hash(ea_inode))
398 + return -EFSCORRUPTED;
402 +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
405 * Read the value from the EA inode.
407 @@ -358,17 +399,53 @@ static int
408 ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
411 + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
412 struct inode *ea_inode;
416 - ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
419 + err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
425 - ret = ext4_xattr_inode_read(ea_inode, buffer, size);
427 + if (i_size_read(ea_inode) != size) {
428 + ext4_warning_inode(ea_inode,
429 + "ea_inode file size=%llu entry size=%zu",
430 + i_size_read(ea_inode), size);
431 + err = -EFSCORRUPTED;
436 + err = ext4_xattr_inode_read(ea_inode, buffer, size);
440 + err = ext4_xattr_inode_verify_hash(ea_inode, buffer, size);
442 + * Compatibility check for old Lustre ea_inode implementation. Old
443 + * version does not have hash validation, but it has a backpointer
444 + * from ea_inode to the parent inode.
446 + if (err == -EFSCORRUPTED) {
447 + if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino ||
448 + ea_inode->i_generation != inode->i_generation) {
449 + ext4_warning_inode(ea_inode,
450 + "EA inode hash validation failed");
453 + /* Do not add ea_inode to the cache. */
454 + ea_inode_cache = NULL;
458 + if (ea_inode_cache)
459 + mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
460 + ext4_xattr_inode_get_hash(ea_inode),
461 + ea_inode->i_ino, true /* reusable */);
468 @@ -656,6 +733,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
472 +static inline size_t round_up_cluster(struct inode *inode, size_t length)
474 + struct super_block *sb = inode->i_sb;
475 + size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
477 + size_t mask = ~(cluster_size - 1);
479 + return (length + cluster_size - 1) & mask;
482 +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
486 + err = dquot_alloc_inode(inode);
489 + err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
491 + dquot_free_inode(inode);
495 +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
497 + dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
498 + dquot_free_inode(inode);
501 +static int __ext4_xattr_set_credits(struct super_block *sb,
502 + struct buffer_head *block_bh,
509 + * 1) Owner inode update
510 + * 2) Ref count update on old xattr block
511 + * 3) new xattr block
512 + * 4) block bitmap update for new xattr block
513 + * 5) group descriptor for new xattr block
517 + /* We are done if ea_inode feature is not enabled. */
518 + if (!ext4_has_feature_ea_inode(sb))
521 + /* New ea_inode, inode map, block bitmap, group descriptor. */
525 + blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
527 + /* Indirection block or one level of extent tree. */
530 + /* Block bitmap and group descriptor updates for each block. */
531 + credits += blocks * 2;
533 + /* Blocks themselves. */
536 + /* Dereference ea_inode holding old xattr value.
537 + * Old ea_inode, inode map, block bitmap, group descriptor.
541 + /* Data blocks for old ea_inode. */
542 + blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
544 + /* Indirection block or one level of extent tree for old ea_inode. */
547 + /* Block bitmap and group descriptor updates for each block. */
548 + credits += blocks * 2;
550 + /* Quota updates. */
551 + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
553 + /* We may need to clone the existing xattr block in which case we need
554 + * to increment ref counts for existing ea_inodes referenced by it.
557 + struct ext4_xattr_entry *entry = BFIRST(block_bh);
559 + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
560 + if (entry->e_value_inum)
561 + /* Ref count update on ea_inode. */
567 static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
568 int credits, struct buffer_head *bh,
569 bool dirty, bool block_csum)
570 @@ -705,12 +877,140 @@ static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
574 +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
577 + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
578 + struct ext4_iloc iloc;
583 + inode_lock(ea_inode);
585 + ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
591 + ref_count = ext4_xattr_inode_get_ref(ea_inode);
592 + ref_count += ref_change;
593 + ext4_xattr_inode_set_ref(ea_inode, ref_count);
595 + if (ref_change > 0) {
596 + WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
597 + ea_inode->i_ino, ref_count);
599 + if (ref_count == 1) {
600 + WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
601 + ea_inode->i_ino, ea_inode->i_nlink);
603 + set_nlink(ea_inode, 1);
604 + ext4_orphan_del(handle, ea_inode);
606 + hash = ext4_xattr_inode_get_hash(ea_inode);
607 + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
609 + true /* reusable */);
612 + WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
613 + ea_inode->i_ino, ref_count);
615 + if (ref_count == 0) {
616 + WARN_ONCE(ea_inode->i_nlink != 1,
617 + "EA inode %lu i_nlink=%u",
618 + ea_inode->i_ino, ea_inode->i_nlink);
620 + clear_nlink(ea_inode);
621 + ext4_orphan_add(handle, ea_inode);
623 + hash = ext4_xattr_inode_get_hash(ea_inode);
624 + mb_cache_entry_delete(ea_inode_cache, hash,
629 + ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
632 + ext4_warning_inode(ea_inode,
633 + "ext4_mark_iloc_dirty() failed ret=%d", ret);
636 + inode_unlock(ea_inode);
640 +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
642 + return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
645 +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
647 + return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
650 +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
651 + struct ext4_xattr_entry *first)
653 + struct inode *ea_inode;
654 + struct ext4_xattr_entry *entry;
655 + struct ext4_xattr_entry *failed_entry;
656 + unsigned int ea_ino;
657 + int err, saved_err;
659 + for (entry = first; !IS_LAST_ENTRY(entry);
660 + entry = EXT4_XATTR_NEXT(entry)) {
661 + if (!entry->e_value_inum)
663 + ea_ino = le32_to_cpu(entry->e_value_inum);
664 + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
667 + err = ext4_xattr_inode_inc_ref(handle, ea_inode);
669 + ext4_warning_inode(ea_inode, "inc ref error %d", err);
679 + failed_entry = entry;
681 + for (entry = first; entry != failed_entry;
682 + entry = EXT4_XATTR_NEXT(entry)) {
683 + if (!entry->e_value_inum)
685 + ea_ino = le32_to_cpu(entry->e_value_inum);
686 + err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
688 + ext4_warning(parent->i_sb,
689 + "cleanup ea_ino %u iget error %d", ea_ino,
693 + err = ext4_xattr_inode_dec_ref(handle, ea_inode);
695 + ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
703 -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
704 - struct buffer_head *bh,
705 - struct ext4_xattr_entry *first, bool block_csum,
706 - struct ext4_xattr_inode_array **ea_inode_array,
708 +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
709 + struct buffer_head *bh,
710 + struct ext4_xattr_entry *first, bool block_csum,
711 + struct ext4_xattr_inode_array **ea_inode_array,
712 + int extra_credits, bool skip_quota)
714 struct inode *ea_inode;
715 struct ext4_xattr_entry *entry;
716 @@ -747,10 +1047,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
720 - inode_lock(ea_inode);
721 - clear_nlink(ea_inode);
722 - ext4_orphan_add(handle, ea_inode);
723 - inode_unlock(ea_inode);
724 + err = ext4_xattr_inode_dec_ref(handle, ea_inode);
726 + ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
732 + ext4_xattr_inode_free_quota(parent,
733 + le32_to_cpu(entry->e_value_size));
736 * Forget about ea_inode within the same transaction that
737 @@ -784,7 +1090,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
740 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
741 - struct buffer_head *bh)
742 + struct buffer_head *bh,
743 + struct ext4_xattr_inode_array **ea_inode_array,
746 struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
748 @@ -807,6 +1115,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
749 mb_cache_entry_delete(ea_block_cache, hash, bh->b_blocknr);
753 + if (ext4_has_feature_ea_inode(inode->i_sb))
754 + ext4_xattr_inode_dec_ref_all(handle, inode, bh,
756 + true /* block_csum */,
759 + true /* skip_quota */);
760 ext4_free_blocks(handle, inode, bh, 0, 1,
761 EXT4_FREE_BLOCKS_METADATA |
762 EXT4_FREE_BLOCKS_FORGET);
763 @@ -878,8 +1194,8 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
765 struct buffer_head *bh = NULL;
766 unsigned long block = 0;
767 - unsigned blocksize = ea_inode->i_sb->s_blocksize;
768 - unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
769 + int blocksize = ea_inode->i_sb->s_blocksize;
770 + int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
771 int csize, wsize = 0;
774 @@ -947,7 +1263,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
775 * Create an inode to store the value of a large EA.
777 static struct inode *ext4_xattr_inode_create(handle_t *handle,
778 - struct inode *inode)
779 + struct inode *inode, u32 hash)
781 struct inode *ea_inode = NULL;
782 uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
783 @@ -965,67 +1281,115 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
784 ea_inode->i_fop = &ext4_file_operations;
785 ext4_set_aops(ea_inode);
786 ext4_xattr_inode_set_class(ea_inode);
787 - ea_inode->i_generation = inode->i_generation;
788 - EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
791 - * A back-pointer from EA inode to parent inode will be useful
794 - EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
795 unlock_new_inode(ea_inode);
796 - err = ext4_inode_attach_jinode(ea_inode);
797 + ext4_xattr_inode_set_ref(ea_inode, 1);
798 + ext4_xattr_inode_set_hash(ea_inode, hash);
799 + err = ext4_mark_inode_dirty(handle, ea_inode);
801 + err = ext4_inode_attach_jinode(ea_inode);
808 + * Xattr inodes are shared therefore quota charging is performed
809 + * at a higher level.
811 + dquot_free_inode(ea_inode);
812 + dquot_drop(ea_inode);
813 + inode_lock(ea_inode);
814 + ea_inode->i_flags |= S_NOQUOTA;
815 + inode_unlock(ea_inode);
822 - * Unlink the inode storing the value of the EA.
824 -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
825 +static struct inode *
826 +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
827 + size_t value_len, u32 hash)
829 - struct inode *ea_inode = NULL;
831 + struct inode *ea_inode;
832 + struct mb_cache_entry *ce;
833 + struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
836 - err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
839 + ce = mb_cache_entry_find_first(ea_inode_cache, hash);
843 - clear_nlink(ea_inode);
845 + ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
847 + mb_cache_entry_put(ea_inode_cache, ce);
853 + ea_inode = ext4_iget(inode->i_sb, ce->e_value);
854 + if (!IS_ERR(ea_inode) &&
855 + !is_bad_inode(ea_inode) &&
856 + (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
857 + i_size_read(ea_inode) == value_len &&
858 + !ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
859 + !ext4_xattr_inode_verify_hash(ea_inode, ea_data,
861 + !memcmp(value, ea_data, value_len)) {
862 + mb_cache_entry_touch(ea_inode_cache, ce);
863 + mb_cache_entry_put(ea_inode_cache, ce);
868 + if (!IS_ERR(ea_inode))
870 + ce = mb_cache_entry_find_next(ea_inode_cache, ce);
877 * Add value of the EA in an inode.
879 -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
880 - unsigned long *ea_ino, const void *value,
882 +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
883 + const void *value, size_t value_len,
884 + struct inode **ret_inode)
886 struct inode *ea_inode;
890 + hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
891 + ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
893 + err = ext4_xattr_inode_inc_ref(handle, ea_inode);
899 + *ret_inode = ea_inode;
903 /* Create an inode for the EA value */
904 - ea_inode = ext4_xattr_inode_create(handle, inode);
905 + ea_inode = ext4_xattr_inode_create(handle, inode, hash);
906 if (IS_ERR(ea_inode))
907 return PTR_ERR(ea_inode);
909 err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
911 - clear_nlink(ea_inode);
913 - *ea_ino = ea_inode->i_ino;
915 + ext4_xattr_inode_dec_ref(handle, ea_inode);
921 + mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
922 + ea_inode->i_ino, true /* reusable */);
925 + *ret_inode = ea_inode;
929 static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
930 @@ -1033,9 +1397,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
931 handle_t *handle, struct inode *inode)
933 struct ext4_xattr_entry *last;
934 - size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
935 + struct ext4_xattr_entry *here = s->here;
936 + size_t min_offs = s->end - s->base, name_len = strlen(i->name);
937 int in_inode = i->in_inode;
939 + struct inode *old_ea_inode = NULL;
940 + struct inode *new_ea_inode = NULL;
941 + size_t old_size, new_size;
944 + /* Space used by old and new values. */
945 + old_size = (!s->not_found && !here->e_value_inum) ?
946 + EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0;
947 + new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0;
950 + * Optimization for the simple case when old and new values have the
951 + * same padded sizes. Not applicable if external inodes are involved.
953 + if (new_size && new_size == old_size) {
954 + size_t offs = le16_to_cpu(here->e_value_offs);
955 + void *val = s->base + offs;
957 + here->e_value_size = cpu_to_le32(i->value_len);
958 + if (i->value == EXT4_ZERO_XATTR_VALUE) {
959 + memset(val, 0, new_size);
961 + memcpy(val, i->value, i->value_len);
962 + /* Clear padding bytes. */
963 + memset(val + i->value_len, 0, new_size - i->value_len);
968 /* Compute min_offs and last. */
970 @@ -1046,122 +1438,148 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
974 - free = min_offs - ((void *)last - s->base) - sizeof(__u32);
975 - if (!s->not_found) {
977 - !s->here->e_value_inum && s->here->e_value_size) {
978 - size_t size = le32_to_cpu(s->here->e_value_size);
979 - free += EXT4_XATTR_SIZE(size);
981 - free += EXT4_XATTR_LEN(name_len);
984 + /* Check whether we have enough space. */
986 - size_t value_len = EXT4_XATTR_SIZE(i->value_len);
991 + free = min_offs - ((void *)last - s->base) - sizeof(__u32);
993 + free += EXT4_XATTR_LEN(name_len) + old_size;
995 - if (free < EXT4_XATTR_LEN(name_len) + value_len)
997 + if (free < EXT4_XATTR_LEN(name_len) + new_size) {
1003 - if (i->value && s->not_found) {
1004 - /* Insert the new name. */
1005 - size_t size = EXT4_XATTR_LEN(name_len);
1006 - size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
1007 - memmove((void *)s->here + size, s->here, rest);
1008 - memset(s->here, 0, size);
1009 - s->here->e_name_index = i->name_index;
1010 - s->here->e_name_len = name_len;
1011 - memcpy(s->here->e_name, i->name, name_len);
1013 - if (!s->here->e_value_inum && s->here->e_value_size &&
1014 - s->here->e_value_offs > 0) {
1015 - void *first_val = s->base + min_offs;
1016 - size_t offs = le16_to_cpu(s->here->e_value_offs);
1017 - void *val = s->base + offs;
1018 - size_t size = EXT4_XATTR_SIZE(
1019 - le32_to_cpu(s->here->e_value_size));
1021 - if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
1022 - /* The old and the new value have the same
1023 - size. Just replace. */
1024 - s->here->e_value_size =
1025 - cpu_to_le32(i->value_len);
1026 - if (i->value == EXT4_ZERO_XATTR_VALUE) {
1027 - memset(val, 0, size);
1029 - /* Clear pad bytes first. */
1030 - memset(val + size - EXT4_XATTR_PAD, 0,
1032 - memcpy(val, i->value, i->value_len);
1037 + * Getting access to old and new ea inodes is subject to failures.
1038 + * Finish that work before doing any modifications to the xattr data.
1040 + if (!s->not_found && here->e_value_inum) {
1041 + ret = ext4_xattr_inode_iget(inode,
1042 + le32_to_cpu(here->e_value_inum),
1045 + old_ea_inode = NULL;
1049 + if (i->value && in_inode) {
1050 + WARN_ON_ONCE(!i->value_len);
1052 - /* Remove the old value. */
1053 - memmove(first_val + size, first_val, val - first_val);
1054 - memset(first_val, 0, size);
1055 - s->here->e_value_size = 0;
1056 - s->here->e_value_offs = 0;
1059 - /* Adjust all value offsets. */
1061 - while (!IS_LAST_ENTRY(last)) {
1062 - size_t o = le16_to_cpu(last->e_value_offs);
1063 - if (!last->e_value_inum &&
1064 - last->e_value_size && o < offs)
1065 - last->e_value_offs =
1066 - cpu_to_le16(o + size);
1067 - last = EXT4_XATTR_NEXT(last);
1069 + ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
1073 + ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
1077 + new_ea_inode = NULL;
1078 + ext4_xattr_inode_free_quota(inode, i->value_len);
1081 - if (s->here->e_value_inum) {
1082 - ext4_xattr_inode_unlink(inode,
1083 - le32_to_cpu(s->here->e_value_inum));
1084 - s->here->e_value_inum = 0;
1087 + if (old_ea_inode) {
1088 + /* We are ready to release ref count on the old_ea_inode. */
1089 + ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
1091 + /* Release newly acquired ref count on new_ea_inode. */
1092 + if (new_ea_inode) {
1095 + err = ext4_xattr_inode_dec_ref(handle,
1098 + ext4_warning_inode(new_ea_inode,
1099 + "dec ref new_ea_inode err=%d",
1101 + ext4_xattr_inode_free_quota(inode,
1107 - /* Remove the old name. */
1108 - size_t size = EXT4_XATTR_LEN(name_len);
1109 - last = ENTRY((void *)last - size);
1110 - memmove(s->here, (void *)s->here + size,
1111 - (void *)last - (void *)s->here + sizeof(__u32));
1112 - memset(last, 0, size);
1114 + ext4_xattr_inode_free_quota(inode,
1115 + le32_to_cpu(here->e_value_size));
1118 + /* No failures allowed past this point. */
1120 + if (!s->not_found && here->e_value_offs) {
1121 + /* Remove the old value. */
1122 + void *first_val = s->base + min_offs;
1123 + size_t offs = le16_to_cpu(here->e_value_offs);
1124 + void *val = s->base + offs;
1126 + memmove(first_val + old_size, first_val, val - first_val);
1127 + memset(first_val, 0, old_size);
1128 + min_offs += old_size;
1130 + /* Adjust all value offsets. */
1132 + while (!IS_LAST_ENTRY(last)) {
1133 + size_t o = le16_to_cpu(last->e_value_offs);
1135 + if (!last->e_value_inum &&
1136 + last->e_value_size && o < offs)
1137 + last->e_value_offs = cpu_to_le16(o + old_size);
1138 + last = EXT4_XATTR_NEXT(last);
1143 + /* Remove old name. */
1144 + size_t size = EXT4_XATTR_LEN(name_len);
1146 + last = ENTRY((void *)last - size);
1147 + memmove(here, (void *)here + size,
1148 + (void *)last - (void *)here + sizeof(__u32));
1149 + memset(last, 0, size);
1150 + } else if (s->not_found) {
1151 + /* Insert new name. */
1152 + size_t size = EXT4_XATTR_LEN(name_len);
1153 + size_t rest = (void *)last - (void *)here + sizeof(__u32);
1155 + memmove((void *)here + size, here, rest);
1156 + memset(here, 0, size);
1157 + here->e_name_index = i->name_index;
1158 + here->e_name_len = name_len;
1159 + memcpy(here->e_name, i->name, name_len);
1161 + /* This is an update, reset value info. */
1162 + here->e_value_inum = 0;
1163 + here->e_value_offs = 0;
1164 + here->e_value_size = 0;
1168 - /* Insert the new value. */
1169 + /* Insert new value. */
1171 - unsigned long ea_ino =
1172 - le32_to_cpu(s->here->e_value_inum);
1173 - rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
1174 - i->value, i->value_len);
1177 - s->here->e_value_inum = cpu_to_le32(ea_ino);
1178 - s->here->e_value_offs = 0;
1179 + here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
1180 } else if (i->value_len) {
1181 - size_t size = EXT4_XATTR_SIZE(i->value_len);
1182 - void *val = s->base + min_offs - size;
1183 - s->here->e_value_offs = cpu_to_le16(min_offs - size);
1184 - s->here->e_value_inum = 0;
1185 + void *val = s->base + min_offs - new_size;
1187 + here->e_value_offs = cpu_to_le16(min_offs - new_size);
1188 if (i->value == EXT4_ZERO_XATTR_VALUE) {
1189 - memset(val, 0, size);
1190 + memset(val, 0, new_size);
1192 - /* Clear the pad bytes first. */
1193 - memset(val + size - EXT4_XATTR_PAD, 0,
1195 memcpy(val, i->value, i->value_len);
1196 + /* Clear padding bytes. */
1197 + memset(val + i->value_len, 0,
1198 + new_size - i->value_len);
1201 - s->here->e_value_size = cpu_to_le32(i->value_len);
1202 + here->e_value_size = cpu_to_le32(i->value_len);
1208 + iput(old_ea_inode);
1209 + iput(new_ea_inode);
1213 struct ext4_xattr_block_find {
1214 @@ -1223,6 +1641,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1215 struct mb_cache_entry *ce = NULL;
1217 struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
1218 + struct inode *ea_inode = NULL;
1219 + size_t old_ea_inode_size = 0;
1221 #define header(x) ((struct ext4_xattr_header *)(x))
1223 @@ -1277,6 +1697,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1224 header(s->base)->h_refcount = cpu_to_le32(1);
1225 s->here = ENTRY(s->base + offset);
1226 s->end = s->base + bs->bh->b_size;
1229 + * If existing entry points to an xattr inode, we need
1230 + * to prevent ext4_xattr_set_entry() from decrementing
1231 + * ref count on it because the reference belongs to the
1232 + * original block. In this case, make the entry look
1233 + * like it has an empty value.
1235 + if (!s->not_found && s->here->e_value_inum) {
1237 + * Defer quota free call for previous inode
1238 + * until success is guaranteed.
1240 + old_ea_inode_size = le32_to_cpu(
1241 + s->here->e_value_size);
1242 + s->here->e_value_inum = 0;
1243 + s->here->e_value_size = 0;
1247 /* Allocate a buffer where we construct the new block. */
1248 @@ -1298,6 +1736,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1253 + if (i->value && s->here->e_value_inum) {
1254 + unsigned int ea_ino;
1257 + * A ref count on ea_inode has been taken as part of the call to
1258 + * ext4_xattr_set_entry() above. We would like to drop this
1259 + * extra ref but we have to wait until the xattr block is
1260 + * initialized and has its own ref count on the ea_inode.
1262 + ea_ino = le32_to_cpu(s->here->e_value_inum);
1263 + error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
1270 if (!IS_LAST_ENTRY(s->first))
1271 ext4_xattr_rehash(header(s->base), s->here);
1273 @@ -1408,6 +1864,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1274 EXT4_FREE_BLOCKS_METADATA);
1277 + error = ext4_xattr_inode_inc_ref_all(handle, inode,
1278 + ENTRY(header(s->base)+1));
1280 + goto getblk_failed;
1282 + /* Drop the extra ref on ea_inode. */
1283 + error = ext4_xattr_inode_dec_ref(handle,
1286 + ext4_warning_inode(ea_inode,
1287 + "dec ref error=%d",
1293 lock_buffer(new_bh);
1294 error = ext4_journal_get_create_access(handle, new_bh);
1296 @@ -1427,15 +1899,38 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1300 + if (old_ea_inode_size)
1301 + ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
1303 /* Update the inode. */
1304 EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
1306 /* Drop the previous xattr block. */
1307 - if (bs->bh && bs->bh != new_bh)
1308 - ext4_xattr_release_block(handle, inode, bs->bh);
1309 + if (bs->bh && bs->bh != new_bh) {
1310 + struct ext4_xattr_inode_array *ea_inode_array = NULL;
1312 + ext4_xattr_release_block(handle, inode, bs->bh,
1314 + 0 /* extra_credits */);
1315 + ext4_xattr_inode_array_free(ea_inode_array);
1323 + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
1325 + ext4_warning_inode(ea_inode, "dec ref error=%d",
1328 + /* If there was an error, revert the quota charge. */
1330 + ext4_xattr_inode_free_quota(inode,
1331 + i_size_read(ea_inode));
1335 mb_cache_entry_put(ea_block_cache, ce);
1337 @@ -1560,6 +2055,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
1338 return !memcmp(value, i->value, i->value_len);
1341 +static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
1343 + struct buffer_head *bh;
1346 + if (!EXT4_I(inode)->i_file_acl)
1348 + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1350 + return ERR_PTR(-EIO);
1351 + error = ext4_xattr_check_block(inode, bh);
1353 + return ERR_PTR(error);
1358 * ext4_xattr_set_handle()
1360 @@ -1602,9 +2113,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1362 /* Check journal credits under write lock. */
1363 if (ext4_handle_valid(handle)) {
1364 + struct buffer_head *bh;
1367 - credits = ext4_xattr_set_credits(inode, value_len);
1368 + bh = ext4_xattr_get_block(inode);
1370 + error = PTR_ERR(bh);
1374 + credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
1377 if (!ext4_handle_has_enough_credits(handle, credits)) {
1380 @@ -1640,6 +2160,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1381 if (flags & XATTR_CREATE)
1386 if (!is.s.not_found)
1387 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
1388 @@ -1708,34 +2229,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1392 -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
1393 +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
1395 - struct super_block *sb = inode->i_sb;
1398 - if (!EXT4_SB(sb)->s_journal)
1400 + struct buffer_head *bh;
1403 - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
1407 - * In case of inline data, we may push out the data to a block,
1408 - * so we need to reserve credits for this eventuality
1410 - if (ext4_has_inline_data(inode))
1411 - credits += ext4_writepage_trans_blocks(inode) + 1;
1413 - if (ext4_has_feature_ea_inode(sb)) {
1414 - int nrblocks = (value_len + sb->s_blocksize - 1) >>
1415 - sb->s_blocksize_bits;
1416 + if (!EXT4_SB(inode->i_sb)->s_journal)
1419 - /* For new inode */
1420 - credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
1421 + down_read(&EXT4_I(inode)->xattr_sem);
1423 - /* For data blocks of EA inode */
1424 - credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
1425 + bh = ext4_xattr_get_block(inode);
1427 + err = PTR_ERR(bh);
1429 + *credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
1435 + up_read(&EXT4_I(inode)->xattr_sem);
1440 @@ -1760,7 +2276,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
1444 - credits = ext4_xattr_set_credits(inode, value_len);
1445 + error = ext4_xattr_set_credits(inode, value_len, &credits);
1449 handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
1450 if (IS_ERR(handle)) {
1451 error = PTR_ERR(handle);
1452 @@ -2066,10 +2585,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
1457 #define EIA_INCR 16 /* must be 2^n */
1458 #define EIA_MASK (EIA_INCR - 1)
1459 -/* Add the large xattr @inode into @ea_inode_array for later deletion.
1461 +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
1462 * If @ea_inode_array is new or full it will be grown and the old
1463 * contents copied over.
1465 @@ -2114,21 +2633,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
1466 * ext4_xattr_delete_inode()
1468 * Free extended attribute resources associated with this inode. Traverse
1469 - * all entries and unlink any xattr inodes associated with this inode. This
1470 - * is called immediately before an inode is freed. We have exclusive
1471 - * access to the inode. If an orphan inode is deleted it will also delete any
1472 - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
1473 - * to ensure they belong to the parent inode and were not deleted already.
1474 + * all entries and decrement reference on any xattr inodes associated with this
1475 + * inode. This is called immediately before an inode is freed. We have exclusive
1476 + * access to the inode. If an orphan inode is deleted it will also release its
1477 + * references on xattr block and xattr inodes.
1480 -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1481 - struct ext4_xattr_inode_array **ea_inode_array,
1482 - int extra_credits)
1483 +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1484 + struct ext4_xattr_inode_array **ea_inode_array,
1485 + int extra_credits)
1487 struct buffer_head *bh = NULL;
1488 struct ext4_xattr_ibody_header *header;
1489 - struct ext4_inode *raw_inode;
1490 struct ext4_iloc iloc = { .bh = NULL };
1491 + struct ext4_xattr_entry *entry;
1494 error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
1495 @@ -2140,66 +2657,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1499 - if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
1500 - goto delete_external_ea;
1501 + if (ext4_has_feature_ea_inode(inode->i_sb) &&
1502 + ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
1504 - error = ext4_get_inode_loc(inode, &iloc);
1508 - error = ext4_journal_get_write_access(handle, iloc.bh);
1511 + error = ext4_get_inode_loc(inode, &iloc);
1513 + EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
1517 - raw_inode = ext4_raw_inode(&iloc);
1518 - header = IHDR(inode, raw_inode);
1519 - ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
1520 - false /* block_csum */, ea_inode_array,
1522 + error = ext4_journal_get_write_access(handle, iloc.bh);
1524 + EXT4_ERROR_INODE(inode, "write access (error %d)",
1529 -delete_external_ea:
1530 - if (!EXT4_I(inode)->i_file_acl) {
1534 - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1536 - EXT4_ERROR_INODE(inode, "block %llu read error",
1537 - EXT4_I(inode)->i_file_acl);
1541 - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1542 - BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1543 - EXT4_ERROR_INODE(inode, "bad block %llu",
1544 - EXT4_I(inode)->i_file_acl);
1545 - error = -EFSCORRUPTED;
1547 + header = IHDR(inode, ext4_raw_inode(&iloc));
1548 + if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
1549 + ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
1551 + false /* block_csum */,
1554 + false /* skip_quota */);
1557 - if (ext4_has_feature_ea_inode(inode->i_sb)) {
1558 - error = ext4_journal_get_write_access(handle, bh);
1560 - EXT4_ERROR_INODE(inode, "write access %llu",
1561 + if (EXT4_I(inode)->i_file_acl) {
1562 + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1564 + EXT4_ERROR_INODE(inode, "block %llu read error",
1565 EXT4_I(inode)->i_file_acl);
1569 + error = ext4_xattr_check_block(inode, bh);
1571 + EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
1572 + EXT4_I(inode)->i_file_acl, error);
1575 - ext4_xattr_inode_remove_all(handle, inode, bh,
1577 - true /* block_csum */,
1582 - ext4_xattr_release_block(handle, inode, bh);
1583 - /* Update i_file_acl within the same transaction that releases block. */
1584 - EXT4_I(inode)->i_file_acl = 0;
1585 - error = ext4_mark_inode_dirty(handle, inode);
1587 - EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
1590 + if (ext4_has_feature_ea_inode(inode->i_sb)) {
1591 + for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
1592 + entry = EXT4_XATTR_NEXT(entry))
1593 + if (entry->e_value_inum)
1594 + ext4_xattr_inode_free_quota(inode,
1595 + le32_to_cpu(entry->e_value_size));
1599 + ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
1602 + * Update i_file_acl value in the same transaction that releases
1605 + EXT4_I(inode)->i_file_acl = 0;
1606 + error = ext4_mark_inode_dirty(handle, inode);
1608 + EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
1617 @@ -2208,17 +2730,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1619 void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
1621 - struct inode *ea_inode;
1625 if (ea_inode_array == NULL)
1628 - for (; idx < ea_inode_array->count; ++idx) {
1629 - ea_inode = ea_inode_array->inodes[idx];
1630 - clear_nlink(ea_inode);
1633 + for (idx = 0; idx < ea_inode_array->count; ++idx)
1634 + iput(ea_inode_array->inodes[idx]);
1635 kfree(ea_inode_array);
1638 diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
1639 index b2005a2716d9..67616cb9a059 100644
1640 --- a/fs/ext4/xattr.h
1641 +++ b/fs/ext4/xattr.h
1642 @@ -69,19 +69,6 @@ struct ext4_xattr_entry {
1643 EXT4_I(inode)->i_extra_isize))
1644 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
1647 - * Link EA inode back to parent one using i_mtime field.
1648 - * Extra integer type conversion added to ignore higher
1649 - * bits in i_mtime.tv_sec which might be set by ext4_get()
1651 -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum) \
1653 - (inode)->i_mtime.tv_sec = inum; \
1656 -#define EXT4_XATTR_INODE_GET_PARENT(inode) \
1657 -((__u32)(inode)->i_mtime.tv_sec)
1660 * The minimum size of EA value when you start storing it in an external inode
1661 * size of block - size of header - size of 1 entry - 4 null bytes
1662 @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
1663 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
1664 extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
1665 extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
1666 -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
1667 +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
1670 -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
1671 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1672 struct ext4_xattr_inode_array **array,
1674 diff --git a/fs/mbcache.c b/fs/mbcache.c
1675 index 45a8d52dc991..d818fd236787 100644
1679 * mb_cache_entry_delete()).
1681 * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
1682 - * They use hash of a block contents as a key and block number as a value.
1683 - * That's why keys need not be unique (different xattr blocks may end up having
1684 - * the same hash). However block number always uniquely identifies a cache
1686 + * Ext4 also uses it for deduplication of xattr values stored in inodes.
1687 + * They use a hash of the data as a key and provide a value that may represent
1688 + * a block or inode number. That's why keys need not be unique (hashes of
1689 + * different data may be the same). However, the user-provided value always
1690 + * uniquely identifies a cache entry.
1692 * We provide functions for creation and removal of entries, search by key,
1693 * and a special "delete entry with given key-value pair" operation. Fixed
1695 2.13.1.611.g7e3b11ae1-goog