add patch add-nombcache-mount-option
[ext4-patch-queue.git] / xattr-inode-deduplication
blob7f39a2a3df0d4c2af2805e1ecaff2e322147e801
1 ext4: xattr inode deduplication
3 From: Tahsin Erdogan <tahsin@google.com>
5 Ext4 now supports xattr values that are up to 64k in size (vfs limit).
6 Large xattr values are stored in external inodes, each one holding a
7 single value. Once written, the data blocks of these inodes are immutable.
9 Real-world use cases are expected to have a lot of value duplication,
10 such as inherited ACLs, etc. To reduce data duplication on disk, this patch
11 implements a deduplicator that allows sharing of xattr inodes.
13 The deduplication is based on an in-memory hash lookup that is a
14 best-effort sharing scheme. When an xattr inode is read from disk (i.e.
15 getxattr() call), its crc32c hash is added to a hash table. Before
16 creating a new xattr inode for a value being set, the hash table is
17 checked to see if an existing inode holds an identical value. If such an
18 inode is found, the ref count on that inode is incremented. On value
19 removal, the ref count is decremented, and if it reaches zero, the inode is
20 deleted.
22 The quota charging for such inodes is manually managed. Every reference
23 holder is charged the full size, as if no sharing were happening.
24 This is consistent with how xattr blocks are also charged.
26 Signed-off-by: Tahsin Erdogan <tahsin@google.com>
27 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
28 ---
29 v6:
30  - Fixed error message "Failed to create an s_ea_inode_cache"
31 v5:
32  - made ext4_meta_trans_blocks() static again since there are no
33    remaining users outside of inode.c
34  - initialize sbi->s_csum_seed when ea_inode feature is enabled
35  - use l_i_version to hold lower 32 bits of the xattr ref count.
36    This avoids clashes with old implementations which use i_mtime.
37    Since l_i_version is not available in HURD_COMPAT mode, fail mount
38    request when both ea_inode feature and HURD_COMPAT are set.
39  - when hash validation fails, fall back to old implementation
40    which has a backref to parent.
41  - fixed checkpatch.pl warning about using unsigned alone
43 v4:
44  - eliminated xattr entry in the xattr inode to avoid complexity and
45    recursion in xattr update path. Now the ref count and hash are stored
46    in i_[c/m/a]time.tv_sec fields.
47  - some clean up in ext4_xattr_set_entry() to reduce code duplication and
48    complexity
50 v3:
51  - use s_csum_seed for hash calculations when available
52  - return error on stored vs calculated hash mismatch
54 v2:
55  - make dependency on crc32c dynamic
56  - update ext4_has_metadata_csum() and ext4_has_group_desc_csum() so that
57    they do not misinterpret existence of EXT4_SB(sb)->s_chksum_driver
59  fs/ext4/acl.c   |    5 +-
60  fs/ext4/ext4.h  |   23 +-
61  fs/ext4/inode.c |   13 +-
62  fs/ext4/super.c |   37 +-
63  fs/ext4/xattr.c | 1038 +++++++++++++++++++++++++++++++++++++++++--------------
64  fs/ext4/xattr.h |   17 +-
65  fs/mbcache.c    |    9 +-
66  7 files changed, 848 insertions(+), 294 deletions(-)
68 diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
69 index 74f7ac539e00..8db03e5c78bc 100644
70 --- a/fs/ext4/acl.c
71 +++ b/fs/ext4/acl.c
72 @@ -238,7 +238,10 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
73         if (error)
74                 return error;
75  retry:
76 -       credits = ext4_xattr_set_credits(inode, acl_size);
77 +       error = ext4_xattr_set_credits(inode, acl_size, &credits);
78 +       if (error)
79 +               return error;
81         handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
82         if (IS_ERR(handle))
83                 return PTR_ERR(handle);
84 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
85 index 3b02bd897b61..dc06287ddec8 100644
86 --- a/fs/ext4/ext4.h
87 +++ b/fs/ext4/ext4.h
88 @@ -1517,6 +1517,7 @@ struct ext4_sb_info {
89         long s_es_nr_inode;
90         struct ext4_es_stats s_es_stats;
91         struct mb_cache *s_ea_block_cache;
92 +       struct mb_cache *s_ea_inode_cache;
93         spinlock_t s_es_lock ____cacheline_aligned_in_smp;
95         /* Ratelimit ext4 messages. */
96 @@ -2099,7 +2100,11 @@ static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
97         return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
98  }
100 -#define ext4_is_quota_file(inode) IS_NOQUOTA(inode)
101 +static inline bool ext4_is_quota_file(struct inode *inode)
103 +       return IS_NOQUOTA(inode) &&
104 +              !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
107  /*
108   * This structure is stuffed into the struct file's private_data field
109 @@ -2482,7 +2487,6 @@ extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
110  extern void ext4_set_inode_flags(struct inode *);
111  extern int ext4_alloc_da_blocks(struct inode *inode);
112  extern void ext4_set_aops(struct inode *inode);
113 -extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
114  extern int ext4_writepage_trans_blocks(struct inode *);
115  extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
116  extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
117 @@ -2709,19 +2713,20 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
118  extern int ext4_register_li_request(struct super_block *sb,
119                                     ext4_group_t first_not_zeroed);
121 -static inline int ext4_has_group_desc_csum(struct super_block *sb)
123 -       return ext4_has_feature_gdt_csum(sb) ||
124 -              EXT4_SB(sb)->s_chksum_driver != NULL;
127  static inline int ext4_has_metadata_csum(struct super_block *sb)
129         WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
130                      !EXT4_SB(sb)->s_chksum_driver);
132 -       return (EXT4_SB(sb)->s_chksum_driver != NULL);
133 +       return ext4_has_feature_metadata_csum(sb) &&
134 +              (EXT4_SB(sb)->s_chksum_driver != NULL);
137 +static inline int ext4_has_group_desc_csum(struct super_block *sb)
139 +       return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
142  static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
144         return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
145 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
146 index cd007f9757d1..ea95bd9eab81 100644
147 --- a/fs/ext4/inode.c
148 +++ b/fs/ext4/inode.c
149 @@ -139,6 +139,8 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
150                                 unsigned int length);
151  static int __ext4_journalled_writepage(struct page *page, unsigned int len);
152  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
153 +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
154 +                                 int pextents);
156  /*
157   * Test whether an inode is a fast symlink.
158 @@ -4843,8 +4845,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
159         }
160         brelse(iloc.bh);
161         ext4_set_inode_flags(inode);
162 -       if (ei->i_flags & EXT4_EA_INODE_FL)
164 +       if (ei->i_flags & EXT4_EA_INODE_FL) {
165                 ext4_xattr_inode_set_class(inode);
167 +               inode_lock(inode);
168 +               inode->i_flags |= S_NOQUOTA;
169 +               inode_unlock(inode);
170 +       }
172         unlock_new_inode(inode);
173         return inode;
175 @@ -5503,7 +5512,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
176   *
177   * Also account for superblock, inode, quota and xattr blocks
178   */
179 -int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
180 +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
181                                   int pextents)
183         ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
184 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
185 index 380389740575..d501f8256dc4 100644
186 --- a/fs/ext4/super.c
187 +++ b/fs/ext4/super.c
188 @@ -927,6 +927,10 @@ static void ext4_put_super(struct super_block *sb)
189                 invalidate_bdev(sbi->journal_bdev);
190                 ext4_blkdev_remove(sbi);
191         }
192 +       if (sbi->s_ea_inode_cache) {
193 +               ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
194 +               sbi->s_ea_inode_cache = NULL;
195 +       }
196         if (sbi->s_ea_block_cache) {
197                 ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
198                 sbi->s_ea_block_cache = NULL;
199 @@ -1178,7 +1182,10 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
200         if (res)
201                 return res;
202  retry:
203 -       credits = ext4_xattr_set_credits(inode, len);
204 +       res = ext4_xattr_set_credits(inode, len, &credits);
205 +       if (res)
206 +               return res;
208         handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
209         if (IS_ERR(handle))
210                 return PTR_ERR(handle);
211 @@ -3445,7 +3452,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
212         }
214         /* Load the checksum driver */
215 -       if (ext4_has_feature_metadata_csum(sb)) {
216 +       if (ext4_has_feature_metadata_csum(sb) ||
217 +           ext4_has_feature_ea_inode(sb)) {
218                 sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
219                 if (IS_ERR(sbi->s_chksum_driver)) {
220                         ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
221 @@ -3467,7 +3475,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
222         /* Precompute checksum seed for all metadata */
223         if (ext4_has_feature_csum_seed(sb))
224                 sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
225 -       else if (ext4_has_metadata_csum(sb))
226 +       else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
227                 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
228                                                sizeof(es->s_uuid));
230 @@ -3597,6 +3605,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
231                                  "The Hurd can't support 64-bit file systems");
232                         goto failed_mount;
233                 }
235 +               /*
236 +                * ea_inode feature uses l_i_version field which is not
237 +                * available in HURD_COMPAT mode.
238 +                */
239 +               if (ext4_has_feature_ea_inode(sb)) {
240 +                       ext4_msg(sb, KERN_ERR,
241 +                                "ea_inode feature is not supported for Hurd");
242 +                       goto failed_mount;
243 +               }
244         }
246         if (IS_EXT2_SB(sb)) {
247 @@ -4067,6 +4085,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
248                 goto failed_mount_wq;
249         }
251 +       if (ext4_has_feature_ea_inode(sb)) {
252 +               sbi->s_ea_inode_cache = ext4_xattr_create_cache();
253 +               if (!sbi->s_ea_inode_cache) {
254 +                       ext4_msg(sb, KERN_ERR,
255 +                                "Failed to create ea_inode_cache");
256 +                       goto failed_mount_wq;
257 +               }
258 +       }
260         if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
261             (blocksize != PAGE_SIZE)) {
262                 ext4_msg(sb, KERN_ERR,
263 @@ -4296,6 +4323,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
264         if (EXT4_SB(sb)->rsv_conversion_wq)
265                 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
266  failed_mount_wq:
267 +       if (sbi->s_ea_inode_cache) {
268 +               ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
269 +               sbi->s_ea_inode_cache = NULL;
270 +       }
271         if (sbi->s_ea_block_cache) {
272                 ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
273                 sbi->s_ea_block_cache = NULL;
274 diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
275 index 94f04b9fb421..15c9f736dcc4 100644
276 --- a/fs/ext4/xattr.c
277 +++ b/fs/ext4/xattr.c
278 @@ -108,6 +108,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
279  #define EA_BLOCK_CACHE(inode)  (((struct ext4_sb_info *) \
280                                 inode->i_sb->s_fs_info)->s_ea_block_cache)
282 +#define EA_INODE_CACHE(inode)  (((struct ext4_sb_info *) \
283 +                               inode->i_sb->s_fs_info)->s_ea_inode_cache)
285  static int
286  ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
287                         struct inode *inode);
288 @@ -280,15 +283,44 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
289         return cmp ? -ENODATA : 0;
292 +static u32
293 +ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
295 +       return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
298 +static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
300 +       return ((u64)ea_inode->i_ctime.tv_sec << 32) |
301 +              ((u32)ea_inode->i_version);
304 +static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
306 +       ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
307 +       ea_inode->i_version = (u32)ref_count;
310 +static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
312 +       return (u32)ea_inode->i_atime.tv_sec;
315 +static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
317 +       ea_inode->i_atime.tv_sec = hash;
320  /*
321   * Read the EA value from an inode.
322   */
323  static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
325         unsigned long block = 0;
326 -       struct buffer_head *bh = NULL;
327 +       struct buffer_head *bh;
328         int blocksize = ea_inode->i_sb->s_blocksize;
329         size_t csize, copied = 0;
330 +       void *copy_pos = buf;
332         while (copied < size) {
333                 csize = (size - copied) > blocksize ? blocksize : size - copied;
334 @@ -298,10 +330,10 @@ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
335                 if (!bh)
336                         return -EFSCORRUPTED;
338 -               memcpy(buf, bh->b_data, csize);
339 +               memcpy(copy_pos, bh->b_data, csize);
340                 brelse(bh);
342 -               buf += csize;
343 +               copy_pos += csize;
344                 block += 1;
345                 copied += csize;
346         }
347 @@ -317,29 +349,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
348         inode = ext4_iget(parent->i_sb, ea_ino);
349         if (IS_ERR(inode)) {
350                 err = PTR_ERR(inode);
351 -               ext4_error(parent->i_sb, "error while reading EA inode %lu "
352 -                          "err=%d", ea_ino, err);
353 +               ext4_error(parent->i_sb,
354 +                          "error while reading EA inode %lu err=%d", ea_ino,
355 +                          err);
356                 return err;
357         }
359         if (is_bad_inode(inode)) {
360 -               ext4_error(parent->i_sb, "error while reading EA inode %lu "
361 -                          "is_bad_inode", ea_ino);
362 +               ext4_error(parent->i_sb,
363 +                          "error while reading EA inode %lu is_bad_inode",
364 +                          ea_ino);
365                 err = -EIO;
366                 goto error;
367         }
369 -       if (EXT4_XATTR_INODE_GET_PARENT(inode) != parent->i_ino ||
370 -           inode->i_generation != parent->i_generation) {
371 -               ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
372 -                          "to parent is invalid.", ea_ino);
373 -               err = -EINVAL;
374 -               goto error;
375 -       }
377         if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
378 -               ext4_error(parent->i_sb, "EA inode %lu does not have "
379 -                          "EXT4_EA_INODE_FL flag set.\n", ea_ino);
380 +               ext4_error(parent->i_sb,
381 +                          "EA inode %lu does not have EXT4_EA_INODE_FL flag",
382 +                           ea_ino);
383                 err = -EINVAL;
384                 goto error;
385         }
386 @@ -351,6 +378,20 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
387         return err;
390 +static int
391 +ext4_xattr_inode_verify_hash(struct inode *ea_inode, void *buffer, size_t size)
393 +       u32 hash;
395 +       /* Verify stored hash matches calculated hash. */
396 +       hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size);
397 +       if (hash != ext4_xattr_inode_get_hash(ea_inode))
398 +               return -EFSCORRUPTED;
399 +       return 0;
402 +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec)
404  /*
405   * Read the value from the EA inode.
406   */
407 @@ -358,17 +399,53 @@ static int
408  ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
409                      size_t size)
411 +       struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
412         struct inode *ea_inode;
413 -       int ret;
414 +       int err;
416 -       ret = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
417 -       if (ret)
418 -               return ret;
419 +       err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
420 +       if (err) {
421 +               ea_inode = NULL;
422 +               goto out;
423 +       }
425 -       ret = ext4_xattr_inode_read(ea_inode, buffer, size);
426 -       iput(ea_inode);
427 +       if (i_size_read(ea_inode) != size) {
428 +               ext4_warning_inode(ea_inode,
429 +                                  "ea_inode file size=%llu entry size=%zu",
430 +                                  i_size_read(ea_inode), size);
431 +               err = -EFSCORRUPTED;
432 +               goto out;
433 +       }
435 -       return ret;
436 +       err = ext4_xattr_inode_read(ea_inode, buffer, size);
437 +       if (err)
438 +               goto out;
440 +       err = ext4_xattr_inode_verify_hash(ea_inode, buffer, size);
441 +       /*
442 +        * Compatibility check for old Lustre ea_inode implementation. Old
443 +        * version does not have hash validation, but it has a backpointer
444 +        * from ea_inode to the parent inode.
445 +        */
446 +       if (err == -EFSCORRUPTED) {
447 +               if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino ||
448 +                   ea_inode->i_generation != inode->i_generation) {
449 +                       ext4_warning_inode(ea_inode,
450 +                                          "EA inode hash validation failed");
451 +                       goto out;
452 +               }
453 +               /* Do not add ea_inode to the cache. */
454 +               ea_inode_cache = NULL;
455 +       } else if (err)
456 +               goto out;
458 +       if (ea_inode_cache)
459 +               mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
460 +                                     ext4_xattr_inode_get_hash(ea_inode),
461 +                                     ea_inode->i_ino, true /* reusable */);
462 +out:
463 +       iput(ea_inode);
464 +       return err;
467  static int
468 @@ -656,6 +733,101 @@ static void ext4_xattr_update_super_block(handle_t *handle,
469         }
472 +static inline size_t round_up_cluster(struct inode *inode, size_t length)
474 +       struct super_block *sb = inode->i_sb;
475 +       size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits +
476 +                                   inode->i_blkbits);
477 +       size_t mask = ~(cluster_size - 1);
479 +       return (length + cluster_size - 1) & mask;
482 +static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
484 +       int err;
486 +       err = dquot_alloc_inode(inode);
487 +       if (err)
488 +               return err;
489 +       err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
490 +       if (err)
491 +               dquot_free_inode(inode);
492 +       return err;
495 +static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len)
497 +       dquot_free_space_nodirty(inode, round_up_cluster(inode, len));
498 +       dquot_free_inode(inode);
501 +static int __ext4_xattr_set_credits(struct super_block *sb,
502 +                                   struct buffer_head *block_bh,
503 +                                   size_t value_len)
505 +       int credits;
506 +       int blocks;
508 +       /*
509 +        * 1) Owner inode update
510 +        * 2) Ref count update on old xattr block
511 +        * 3) new xattr block
512 +        * 4) block bitmap update for new xattr block
513 +        * 5) group descriptor for new xattr block
514 +        */
515 +       credits = 5;
517 +       /* We are done if ea_inode feature is not enabled. */
518 +       if (!ext4_has_feature_ea_inode(sb))
519 +               return credits;
521 +       /* New ea_inode, inode map, block bitmap, group descriptor. */
522 +       credits += 4;
524 +       /* Data blocks. */
525 +       blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
527 +       /* Indirection block or one level of extent tree. */
528 +       blocks += 1;
530 +       /* Block bitmap and group descriptor updates for each block. */
531 +       credits += blocks * 2;
533 +       /* Blocks themselves. */
534 +       credits += blocks;
536 +       /* Dereference ea_inode holding old xattr value.
537 +        * Old ea_inode, inode map, block bitmap, group descriptor.
538 +        */
539 +       credits += 4;
541 +       /* Data blocks for old ea_inode. */
542 +       blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits;
544 +       /* Indirection block or one level of extent tree for old ea_inode. */
545 +       blocks += 1;
547 +       /* Block bitmap and group descriptor updates for each block. */
548 +       credits += blocks * 2;
550 +       /* Quota updates. */
551 +       credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);
553 +       /* We may need to clone the existing xattr block in which case we need
554 +        * to increment ref counts for existing ea_inodes referenced by it.
555 +        */
556 +       if (block_bh) {
557 +               struct ext4_xattr_entry *entry = BFIRST(block_bh);
559 +               for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry))
560 +                       if (entry->e_value_inum)
561 +                               /* Ref count update on ea_inode. */
562 +                               credits += 1;
563 +       }
564 +       return credits;
567  static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
568                                      int credits, struct buffer_head *bh,
569                                      bool dirty, bool block_csum)
570 @@ -705,12 +877,140 @@ static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
571         return 0;
574 +static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
575 +                                      int ref_change)
577 +       struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
578 +       struct ext4_iloc iloc;
579 +       s64 ref_count;
580 +       u32 hash;
581 +       int ret;
583 +       inode_lock(ea_inode);
585 +       ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
586 +       if (ret) {
587 +               iloc.bh = NULL;
588 +               goto out;
589 +       }
591 +       ref_count = ext4_xattr_inode_get_ref(ea_inode);
592 +       ref_count += ref_change;
593 +       ext4_xattr_inode_set_ref(ea_inode, ref_count);
595 +       if (ref_change > 0) {
596 +               WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
597 +                         ea_inode->i_ino, ref_count);
599 +               if (ref_count == 1) {
600 +                       WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
601 +                                 ea_inode->i_ino, ea_inode->i_nlink);
603 +                       set_nlink(ea_inode, 1);
604 +                       ext4_orphan_del(handle, ea_inode);
606 +                       hash = ext4_xattr_inode_get_hash(ea_inode);
607 +                       mb_cache_entry_create(ea_inode_cache, GFP_NOFS, hash,
608 +                                             ea_inode->i_ino,
609 +                                             true /* reusable */);
610 +               }
611 +       } else {
612 +               WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
613 +                         ea_inode->i_ino, ref_count);
615 +               if (ref_count == 0) {
616 +                       WARN_ONCE(ea_inode->i_nlink != 1,
617 +                                 "EA inode %lu i_nlink=%u",
618 +                                 ea_inode->i_ino, ea_inode->i_nlink);
620 +                       clear_nlink(ea_inode);
621 +                       ext4_orphan_add(handle, ea_inode);
623 +                       hash = ext4_xattr_inode_get_hash(ea_inode);
624 +                       mb_cache_entry_delete(ea_inode_cache, hash,
625 +                                             ea_inode->i_ino);
626 +               }
627 +       }
629 +       ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
630 +       iloc.bh = NULL;
631 +       if (ret)
632 +               ext4_warning_inode(ea_inode,
633 +                                  "ext4_mark_iloc_dirty() failed ret=%d", ret);
634 +out:
635 +       brelse(iloc.bh);
636 +       inode_unlock(ea_inode);
637 +       return ret;
640 +static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
642 +       return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
645 +static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
647 +       return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
650 +static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
651 +                                       struct ext4_xattr_entry *first)
653 +       struct inode *ea_inode;
654 +       struct ext4_xattr_entry *entry;
655 +       struct ext4_xattr_entry *failed_entry;
656 +       unsigned int ea_ino;
657 +       int err, saved_err;
659 +       for (entry = first; !IS_LAST_ENTRY(entry);
660 +            entry = EXT4_XATTR_NEXT(entry)) {
661 +               if (!entry->e_value_inum)
662 +                       continue;
663 +               ea_ino = le32_to_cpu(entry->e_value_inum);
664 +               err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
665 +               if (err)
666 +                       goto cleanup;
667 +               err = ext4_xattr_inode_inc_ref(handle, ea_inode);
668 +               if (err) {
669 +                       ext4_warning_inode(ea_inode, "inc ref error %d", err);
670 +                       iput(ea_inode);
671 +                       goto cleanup;
672 +               }
673 +               iput(ea_inode);
674 +       }
675 +       return 0;
677 +cleanup:
678 +       saved_err = err;
679 +       failed_entry = entry;
681 +       for (entry = first; entry != failed_entry;
682 +            entry = EXT4_XATTR_NEXT(entry)) {
683 +               if (!entry->e_value_inum)
684 +                       continue;
685 +               ea_ino = le32_to_cpu(entry->e_value_inum);
686 +               err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode);
687 +               if (err) {
688 +                       ext4_warning(parent->i_sb,
689 +                                    "cleanup ea_ino %u iget error %d", ea_ino,
690 +                                    err);
691 +                       continue;
692 +               }
693 +               err = ext4_xattr_inode_dec_ref(handle, ea_inode);
694 +               if (err)
695 +                       ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
696 +                                          err);
697 +               iput(ea_inode);
698 +       }
699 +       return saved_err;
702  static void
703 -ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
704 -                           struct buffer_head *bh,
705 -                           struct ext4_xattr_entry *first, bool block_csum,
706 -                           struct ext4_xattr_inode_array **ea_inode_array,
707 -                           int extra_credits)
708 +ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
709 +                            struct buffer_head *bh,
710 +                            struct ext4_xattr_entry *first, bool block_csum,
711 +                            struct ext4_xattr_inode_array **ea_inode_array,
712 +                            int extra_credits, bool skip_quota)
714         struct inode *ea_inode;
715         struct ext4_xattr_entry *entry;
716 @@ -747,10 +1047,16 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
717                         continue;
718                 }
720 -               inode_lock(ea_inode);
721 -               clear_nlink(ea_inode);
722 -               ext4_orphan_add(handle, ea_inode);
723 -               inode_unlock(ea_inode);
724 +               err = ext4_xattr_inode_dec_ref(handle, ea_inode);
725 +               if (err) {
726 +                       ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
727 +                                          err);
728 +                       continue;
729 +               }
731 +               if (!skip_quota)
732 +                       ext4_xattr_inode_free_quota(parent,
733 +                                             le32_to_cpu(entry->e_value_size));
735                 /*
736                  * Forget about ea_inode within the same transaction that
737 @@ -784,7 +1090,9 @@ ext4_xattr_inode_remove_all(handle_t *handle, struct inode *parent,
738   */
739  static void
740  ext4_xattr_release_block(handle_t *handle, struct inode *inode,
741 -                        struct buffer_head *bh)
742 +                        struct buffer_head *bh,
743 +                        struct ext4_xattr_inode_array **ea_inode_array,
744 +                        int extra_credits)
746         struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
747         u32 hash, ref;
748 @@ -807,6 +1115,14 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
749                 mb_cache_entry_delete(ea_block_cache, hash, bh->b_blocknr);
750                 get_bh(bh);
751                 unlock_buffer(bh);
753 +               if (ext4_has_feature_ea_inode(inode->i_sb))
754 +                       ext4_xattr_inode_dec_ref_all(handle, inode, bh,
755 +                                                    BFIRST(bh),
756 +                                                    true /* block_csum */,
757 +                                                    ea_inode_array,
758 +                                                    extra_credits,
759 +                                                    true /* skip_quota */);
760                 ext4_free_blocks(handle, inode, bh, 0, 1,
761                                  EXT4_FREE_BLOCKS_METADATA |
762                                  EXT4_FREE_BLOCKS_FORGET);
763 @@ -878,8 +1194,8 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
765         struct buffer_head *bh = NULL;
766         unsigned long block = 0;
767 -       unsigned blocksize = ea_inode->i_sb->s_blocksize;
768 -       unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
769 +       int blocksize = ea_inode->i_sb->s_blocksize;
770 +       int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
771         int csize, wsize = 0;
772         int ret = 0;
773         int retries = 0;
774 @@ -947,7 +1263,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
775   * Create an inode to store the value of a large EA.
776   */
777  static struct inode *ext4_xattr_inode_create(handle_t *handle,
778 -                                            struct inode *inode)
779 +                                            struct inode *inode, u32 hash)
781         struct inode *ea_inode = NULL;
782         uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
783 @@ -965,67 +1281,115 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
784                 ea_inode->i_fop = &ext4_file_operations;
785                 ext4_set_aops(ea_inode);
786                 ext4_xattr_inode_set_class(ea_inode);
787 -               ea_inode->i_generation = inode->i_generation;
788 -               EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
790 -               /*
791 -                * A back-pointer from EA inode to parent inode will be useful
792 -                * for e2fsck.
793 -                */
794 -               EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
795                 unlock_new_inode(ea_inode);
796 -               err = ext4_inode_attach_jinode(ea_inode);
797 +               ext4_xattr_inode_set_ref(ea_inode, 1);
798 +               ext4_xattr_inode_set_hash(ea_inode, hash);
799 +               err = ext4_mark_inode_dirty(handle, ea_inode);
800 +               if (!err)
801 +                       err = ext4_inode_attach_jinode(ea_inode);
802                 if (err) {
803                         iput(ea_inode);
804                         return ERR_PTR(err);
805                 }
807 +               /*
808 +                * Xattr inodes are shared therefore quota charging is performed
809 +                * at a higher level.
810 +                */
811 +               dquot_free_inode(ea_inode);
812 +               dquot_drop(ea_inode);
813 +               inode_lock(ea_inode);
814 +               ea_inode->i_flags |= S_NOQUOTA;
815 +               inode_unlock(ea_inode);
816         }
818         return ea_inode;
822 - * Unlink the inode storing the value of the EA.
823 - */
824 -int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
825 +static struct inode *
826 +ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
827 +                           size_t value_len, u32 hash)
829 -       struct inode *ea_inode = NULL;
830 -       int err;
831 +       struct inode *ea_inode;
832 +       struct mb_cache_entry *ce;
833 +       struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
834 +       void *ea_data;
836 -       err = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
837 -       if (err)
838 -               return err;
839 +       ce = mb_cache_entry_find_first(ea_inode_cache, hash);
840 +       if (!ce)
841 +               return NULL;
843 -       clear_nlink(ea_inode);
844 -       iput(ea_inode);
845 +       ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
846 +       if (!ea_data) {
847 +               mb_cache_entry_put(ea_inode_cache, ce);
848 +               return NULL;
849 +       }
851 -       return 0;
852 +       while (ce) {
853 +               ea_inode = ext4_iget(inode->i_sb, ce->e_value);
854 +               if (!IS_ERR(ea_inode) &&
855 +                   !is_bad_inode(ea_inode) &&
856 +                   (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
857 +                   i_size_read(ea_inode) == value_len &&
858 +                   !ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
859 +                   !ext4_xattr_inode_verify_hash(ea_inode, ea_data,
860 +                                                 value_len) &&
861 +                   !memcmp(value, ea_data, value_len)) {
862 +                       mb_cache_entry_touch(ea_inode_cache, ce);
863 +                       mb_cache_entry_put(ea_inode_cache, ce);
864 +                       kvfree(ea_data);
865 +                       return ea_inode;
866 +               }
868 +               if (!IS_ERR(ea_inode))
869 +                       iput(ea_inode);
870 +               ce = mb_cache_entry_find_next(ea_inode_cache, ce);
871 +       }
872 +       kvfree(ea_data);
873 +       return NULL;
876  /*
877   * Add value of the EA in an inode.
878   */
879 -static int ext4_xattr_inode_set(handle_t *handle, struct inode *inode,
880 -                               unsigned long *ea_ino, const void *value,
881 -                               size_t value_len)
882 +static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
883 +                                         const void *value, size_t value_len,
884 +                                         struct inode **ret_inode)
886         struct inode *ea_inode;
887 +       u32 hash;
888         int err;
890 +       hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
891 +       ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
892 +       if (ea_inode) {
893 +               err = ext4_xattr_inode_inc_ref(handle, ea_inode);
894 +               if (err) {
895 +                       iput(ea_inode);
896 +                       return err;
897 +               }
899 +               *ret_inode = ea_inode;
900 +               return 0;
901 +       }
903         /* Create an inode for the EA value */
904 -       ea_inode = ext4_xattr_inode_create(handle, inode);
905 +       ea_inode = ext4_xattr_inode_create(handle, inode, hash);
906         if (IS_ERR(ea_inode))
907                 return PTR_ERR(ea_inode);
909         err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
910 -       if (err)
911 -               clear_nlink(ea_inode);
912 -       else
913 -               *ea_ino = ea_inode->i_ino;
914 +       if (err) {
915 +               ext4_xattr_inode_dec_ref(handle, ea_inode);
916 +               iput(ea_inode);
917 +               return err;
918 +       }
920 -       iput(ea_inode);
921 +       mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
922 +                             ea_inode->i_ino, true /* reusable */);
924 -       return err;
925 +       *ret_inode = ea_inode;
926 +       return 0;
929  static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
930 @@ -1033,9 +1397,37 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
931                                 handle_t *handle, struct inode *inode)
933         struct ext4_xattr_entry *last;
934 -       size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
935 +       struct ext4_xattr_entry *here = s->here;
936 +       size_t min_offs = s->end - s->base, name_len = strlen(i->name);
937         int in_inode = i->in_inode;
938 -       int rc;
939 +       struct inode *old_ea_inode = NULL;
940 +       struct inode *new_ea_inode = NULL;
941 +       size_t old_size, new_size;
942 +       int ret;
944 +       /* Space used by old and new values. */
945 +       old_size = (!s->not_found && !here->e_value_inum) ?
946 +                       EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0;
947 +       new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0;
949 +       /*
950 +        * Optimization for the simple case when old and new values have the
951 +        * same padded sizes. Not applicable if external inodes are involved.
952 +        */
953 +       if (new_size && new_size == old_size) {
954 +               size_t offs = le16_to_cpu(here->e_value_offs);
955 +               void *val = s->base + offs;
957 +               here->e_value_size = cpu_to_le32(i->value_len);
958 +               if (i->value == EXT4_ZERO_XATTR_VALUE) {
959 +                       memset(val, 0, new_size);
960 +               } else {
961 +                       memcpy(val, i->value, i->value_len);
962 +                       /* Clear padding bytes. */
963 +                       memset(val + i->value_len, 0, new_size - i->value_len);
964 +               }
965 +               return 0;
966 +       }
968         /* Compute min_offs and last. */
969         last = s->first;
970 @@ -1046,122 +1438,148 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
971                                 min_offs = offs;
972                 }
973         }
974 -       free = min_offs - ((void *)last - s->base) - sizeof(__u32);
975 -       if (!s->not_found) {
976 -               if (!in_inode &&
977 -                   !s->here->e_value_inum && s->here->e_value_size) {
978 -                       size_t size = le32_to_cpu(s->here->e_value_size);
979 -                       free += EXT4_XATTR_SIZE(size);
980 -               }
981 -               free += EXT4_XATTR_LEN(name_len);
982 -       }
984 +       /* Check whether we have enough space. */
985         if (i->value) {
986 -               size_t value_len = EXT4_XATTR_SIZE(i->value_len);
987 +               size_t free;
989 -               if (in_inode)
990 -                       value_len = 0;
991 +               free = min_offs - ((void *)last - s->base) - sizeof(__u32);
992 +               if (!s->not_found)
993 +                       free += EXT4_XATTR_LEN(name_len) + old_size;
995 -               if (free < EXT4_XATTR_LEN(name_len) + value_len)
996 -                       return -ENOSPC;
997 +               if (free < EXT4_XATTR_LEN(name_len) + new_size) {
998 +                       ret = -ENOSPC;
999 +                       goto out;
1000 +               }
1001         }
1003 -       if (i->value && s->not_found) {
1004 -               /* Insert the new name. */
1005 -               size_t size = EXT4_XATTR_LEN(name_len);
1006 -               size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
1007 -               memmove((void *)s->here + size, s->here, rest);
1008 -               memset(s->here, 0, size);
1009 -               s->here->e_name_index = i->name_index;
1010 -               s->here->e_name_len = name_len;
1011 -               memcpy(s->here->e_name, i->name, name_len);
1012 -       } else {
1013 -               if (!s->here->e_value_inum && s->here->e_value_size &&
1014 -                   s->here->e_value_offs > 0) {
1015 -                       void *first_val = s->base + min_offs;
1016 -                       size_t offs = le16_to_cpu(s->here->e_value_offs);
1017 -                       void *val = s->base + offs;
1018 -                       size_t size = EXT4_XATTR_SIZE(
1019 -                               le32_to_cpu(s->here->e_value_size));
1021 -                       if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
1022 -                               /* The old and the new value have the same
1023 -                                  size. Just replace. */
1024 -                               s->here->e_value_size =
1025 -                                       cpu_to_le32(i->value_len);
1026 -                               if (i->value == EXT4_ZERO_XATTR_VALUE) {
1027 -                                       memset(val, 0, size);
1028 -                               } else {
1029 -                                       /* Clear pad bytes first. */
1030 -                                       memset(val + size - EXT4_XATTR_PAD, 0,
1031 -                                              EXT4_XATTR_PAD);
1032 -                                       memcpy(val, i->value, i->value_len);
1033 -                               }
1034 -                               return 0;
1035 -                       }
1036 +       /*
1037 +        * Getting access to old and new ea inodes is subject to failures.
1038 +        * Finish that work before doing any modifications to the xattr data.
1039 +        */
1040 +       if (!s->not_found && here->e_value_inum) {
1041 +               ret = ext4_xattr_inode_iget(inode,
1042 +                                           le32_to_cpu(here->e_value_inum),
1043 +                                           &old_ea_inode);
1044 +               if (ret) {
1045 +                       old_ea_inode = NULL;
1046 +                       goto out;
1047 +               }
1048 +       }
1049 +       if (i->value && in_inode) {
1050 +               WARN_ON_ONCE(!i->value_len);
1052 -                       /* Remove the old value. */
1053 -                       memmove(first_val + size, first_val, val - first_val);
1054 -                       memset(first_val, 0, size);
1055 -                       s->here->e_value_size = 0;
1056 -                       s->here->e_value_offs = 0;
1057 -                       min_offs += size;
1059 -                       /* Adjust all value offsets. */
1060 -                       last = s->first;
1061 -                       while (!IS_LAST_ENTRY(last)) {
1062 -                               size_t o = le16_to_cpu(last->e_value_offs);
1063 -                               if (!last->e_value_inum &&
1064 -                                   last->e_value_size && o < offs)
1065 -                                       last->e_value_offs =
1066 -                                               cpu_to_le16(o + size);
1067 -                               last = EXT4_XATTR_NEXT(last);
1068 -                       }
1069 +               ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
1070 +               if (ret)
1071 +                       goto out;
1073 +               ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
1074 +                                                    i->value_len,
1075 +                                                    &new_ea_inode);
1076 +               if (ret) {
1077 +                       new_ea_inode = NULL;
1078 +                       ext4_xattr_inode_free_quota(inode, i->value_len);
1079 +                       goto out;
1080                 }
1081 -               if (s->here->e_value_inum) {
1082 -                       ext4_xattr_inode_unlink(inode,
1083 -                                           le32_to_cpu(s->here->e_value_inum));
1084 -                       s->here->e_value_inum = 0;
1085 +       }
1087 +       if (old_ea_inode) {
1088 +               /* We are ready to release ref count on the old_ea_inode. */
1089 +               ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
1090 +               if (ret) {
1091 +                       /* Release newly acquired ref count on new_ea_inode. */
1092 +                       if (new_ea_inode) {
1093 +                               int err;
1095 +                               err = ext4_xattr_inode_dec_ref(handle,
1096 +                                                              new_ea_inode);
1097 +                               if (err)
1098 +                                       ext4_warning_inode(new_ea_inode,
1099 +                                                 "dec ref new_ea_inode err=%d",
1100 +                                                 err);
1101 +                               ext4_xattr_inode_free_quota(inode,
1102 +                                                           i->value_len);
1103 +                       }
1104 +                       goto out;
1105                 }
1106 -               if (!i->value) {
1107 -                       /* Remove the old name. */
1108 -                       size_t size = EXT4_XATTR_LEN(name_len);
1109 -                       last = ENTRY((void *)last - size);
1110 -                       memmove(s->here, (void *)s->here + size,
1111 -                               (void *)last - (void *)s->here + sizeof(__u32));
1112 -                       memset(last, 0, size);
1114 +               ext4_xattr_inode_free_quota(inode,
1115 +                                           le32_to_cpu(here->e_value_size));
1116 +       }
1118 +       /* No failures allowed past this point. */
1120 +       if (!s->not_found && here->e_value_offs) {
1121 +               /* Remove the old value. */
1122 +               void *first_val = s->base + min_offs;
1123 +               size_t offs = le16_to_cpu(here->e_value_offs);
1124 +               void *val = s->base + offs;
1126 +               memmove(first_val + old_size, first_val, val - first_val);
1127 +               memset(first_val, 0, old_size);
1128 +               min_offs += old_size;
1130 +               /* Adjust all value offsets. */
1131 +               last = s->first;
1132 +               while (!IS_LAST_ENTRY(last)) {
1133 +                       size_t o = le16_to_cpu(last->e_value_offs);
1135 +                       if (!last->e_value_inum &&
1136 +                           last->e_value_size && o < offs)
1137 +                               last->e_value_offs = cpu_to_le16(o + old_size);
1138 +                       last = EXT4_XATTR_NEXT(last);
1139                 }
1140         }
1142 +       if (!i->value) {
1143 +               /* Remove old name. */
1144 +               size_t size = EXT4_XATTR_LEN(name_len);
1146 +               last = ENTRY((void *)last - size);
1147 +               memmove(here, (void *)here + size,
1148 +                       (void *)last - (void *)here + sizeof(__u32));
1149 +               memset(last, 0, size);
1150 +       } else if (s->not_found) {
1151 +               /* Insert new name. */
1152 +               size_t size = EXT4_XATTR_LEN(name_len);
1153 +               size_t rest = (void *)last - (void *)here + sizeof(__u32);
1155 +               memmove((void *)here + size, here, rest);
1156 +               memset(here, 0, size);
1157 +               here->e_name_index = i->name_index;
1158 +               here->e_name_len = name_len;
1159 +               memcpy(here->e_name, i->name, name_len);
1160 +       } else {
1161 +               /* This is an update, reset value info. */
1162 +               here->e_value_inum = 0;
1163 +               here->e_value_offs = 0;
1164 +               here->e_value_size = 0;
1165 +       }
1167         if (i->value) {
1168 -               /* Insert the new value. */
1169 +               /* Insert new value. */
1170                 if (in_inode) {
1171 -                       unsigned long ea_ino =
1172 -                               le32_to_cpu(s->here->e_value_inum);
1173 -                       rc = ext4_xattr_inode_set(handle, inode, &ea_ino,
1174 -                                                 i->value, i->value_len);
1175 -                       if (rc)
1176 -                               goto out;
1177 -                       s->here->e_value_inum = cpu_to_le32(ea_ino);
1178 -                       s->here->e_value_offs = 0;
1179 +                       here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
1180                 } else if (i->value_len) {
1181 -                       size_t size = EXT4_XATTR_SIZE(i->value_len);
1182 -                       void *val = s->base + min_offs - size;
1183 -                       s->here->e_value_offs = cpu_to_le16(min_offs - size);
1184 -                       s->here->e_value_inum = 0;
1185 +                       void *val = s->base + min_offs - new_size;
1187 +                       here->e_value_offs = cpu_to_le16(min_offs - new_size);
1188                         if (i->value == EXT4_ZERO_XATTR_VALUE) {
1189 -                               memset(val, 0, size);
1190 +                               memset(val, 0, new_size);
1191                         } else {
1192 -                               /* Clear the pad bytes first. */
1193 -                               memset(val + size - EXT4_XATTR_PAD, 0,
1194 -                                      EXT4_XATTR_PAD);
1195                                 memcpy(val, i->value, i->value_len);
1196 +                               /* Clear padding bytes. */
1197 +                               memset(val + i->value_len, 0,
1198 +                                      new_size - i->value_len);
1199                         }
1200                 }
1201 -               s->here->e_value_size = cpu_to_le32(i->value_len);
1202 +               here->e_value_size = cpu_to_le32(i->value_len);
1203         }
1205 +       ret = 0;
1206  out:
1207 -       return rc;
1208 +       iput(old_ea_inode);
1209 +       iput(new_ea_inode);
1210 +       return ret;
1213  struct ext4_xattr_block_find {
1214 @@ -1223,6 +1641,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1215         struct mb_cache_entry *ce = NULL;
1216         int error = 0;
1217         struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
1218 +       struct inode *ea_inode = NULL;
1219 +       size_t old_ea_inode_size = 0;
1221  #define header(x) ((struct ext4_xattr_header *)(x))
1223 @@ -1277,6 +1697,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1224                         header(s->base)->h_refcount = cpu_to_le32(1);
1225                         s->here = ENTRY(s->base + offset);
1226                         s->end = s->base + bs->bh->b_size;
1228 +                       /*
1229 +                        * If existing entry points to an xattr inode, we need
1230 +                        * to prevent ext4_xattr_set_entry() from decrementing
1231 +                        * ref count on it because the reference belongs to the
1232 +                        * original block. In this case, make the entry look
1233 +                        * like it has an empty value.
1234 +                        */
1235 +                       if (!s->not_found && s->here->e_value_inum) {
1236 +                               /*
1237 +                                * Defer quota free call for previous inode
1238 +                                * until success is guaranteed.
1239 +                                */
1240 +                               old_ea_inode_size = le32_to_cpu(
1241 +                                                       s->here->e_value_size);
1242 +                               s->here->e_value_inum = 0;
1243 +                               s->here->e_value_size = 0;
1244 +                       }
1245                 }
1246         } else {
1247                 /* Allocate a buffer where we construct the new block. */
1248 @@ -1298,6 +1736,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1249                 goto bad_block;
1250         if (error)
1251                 goto cleanup;
1253 +       if (i->value && s->here->e_value_inum) {
1254 +               unsigned int ea_ino;
1256 +               /*
1257 +                * A ref count on ea_inode has been taken as part of the call to
1258 +                * ext4_xattr_set_entry() above. We would like to drop this
1259 +                * extra ref but we have to wait until the xattr block is
1260 +                * initialized and has its own ref count on the ea_inode.
1261 +                */
1262 +               ea_ino = le32_to_cpu(s->here->e_value_inum);
1263 +               error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode);
1264 +               if (error) {
1265 +                       ea_inode = NULL;
1266 +                       goto cleanup;
1267 +               }
1268 +       }
1270         if (!IS_LAST_ENTRY(s->first))
1271                 ext4_xattr_rehash(header(s->base), s->here);
1273 @@ -1408,6 +1864,22 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1274                                                  EXT4_FREE_BLOCKS_METADATA);
1275                                 goto cleanup;
1276                         }
1277 +                       error = ext4_xattr_inode_inc_ref_all(handle, inode,
1278 +                                                     ENTRY(header(s->base)+1));
1279 +                       if (error)
1280 +                               goto getblk_failed;
1281 +                       if (ea_inode) {
1282 +                               /* Drop the extra ref on ea_inode. */
1283 +                               error = ext4_xattr_inode_dec_ref(handle,
1284 +                                                                ea_inode);
1285 +                               if (error)
1286 +                                       ext4_warning_inode(ea_inode,
1287 +                                                          "dec ref error=%d",
1288 +                                                          error);
1289 +                               iput(ea_inode);
1290 +                               ea_inode = NULL;
1291 +                       }
1293                         lock_buffer(new_bh);
1294                         error = ext4_journal_get_create_access(handle, new_bh);
1295                         if (error) {
1296 @@ -1427,15 +1899,38 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
1297                 }
1298         }
1300 +       if (old_ea_inode_size)
1301 +               ext4_xattr_inode_free_quota(inode, old_ea_inode_size);
1303         /* Update the inode. */
1304         EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
1306         /* Drop the previous xattr block. */
1307 -       if (bs->bh && bs->bh != new_bh)
1308 -               ext4_xattr_release_block(handle, inode, bs->bh);
1309 +       if (bs->bh && bs->bh != new_bh) {
1310 +               struct ext4_xattr_inode_array *ea_inode_array = NULL;
1312 +               ext4_xattr_release_block(handle, inode, bs->bh,
1313 +                                        &ea_inode_array,
1314 +                                        0 /* extra_credits */);
1315 +               ext4_xattr_inode_array_free(ea_inode_array);
1316 +       }
1317         error = 0;
1319  cleanup:
1320 +       if (ea_inode) {
1321 +               int error2;
1323 +               error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
1324 +               if (error2)
1325 +                       ext4_warning_inode(ea_inode, "dec ref error=%d",
1326 +                                          error2);
1328 +               /* If there was an error, revert the quota charge. */
1329 +               if (error)
1330 +                       ext4_xattr_inode_free_quota(inode,
1331 +                                                   i_size_read(ea_inode));
1332 +               iput(ea_inode);
1333 +       }
1334         if (ce)
1335                 mb_cache_entry_put(ea_block_cache, ce);
1336         brelse(new_bh);
1337 @@ -1560,6 +2055,22 @@ static int ext4_xattr_value_same(struct ext4_xattr_search *s,
1338         return !memcmp(value, i->value, i->value_len);
1341 +static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
1343 +       struct buffer_head *bh;
1344 +       int error;
1346 +       if (!EXT4_I(inode)->i_file_acl)
1347 +               return NULL;
1348 +       bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1349 +       if (!bh)
1350 +               return ERR_PTR(-EIO);
1351 +       error = ext4_xattr_check_block(inode, bh);
1352 +       if (error)
1353 +               return ERR_PTR(error);
1354 +       return bh;
1357  /*
1358   * ext4_xattr_set_handle()
1359   *
1360 @@ -1602,9 +2113,18 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1362         /* Check journal credits under write lock. */
1363         if (ext4_handle_valid(handle)) {
1364 +               struct buffer_head *bh;
1365                 int credits;
1367 -               credits = ext4_xattr_set_credits(inode, value_len);
1368 +               bh = ext4_xattr_get_block(inode);
1369 +               if (IS_ERR(bh)) {
1370 +                       error = PTR_ERR(bh);
1371 +                       goto cleanup;
1372 +               }
1374 +               credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
1375 +               brelse(bh);
1377                 if (!ext4_handle_has_enough_credits(handle, credits)) {
1378                         error = -ENOSPC;
1379                         goto cleanup;
1380 @@ -1640,6 +2160,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1381                 if (flags & XATTR_CREATE)
1382                         goto cleanup;
1383         }
1385         if (!value) {
1386                 if (!is.s.not_found)
1387                         error = ext4_xattr_ibody_set(handle, inode, &i, &is);
1388 @@ -1708,34 +2229,29 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1389         return error;
1392 -int ext4_xattr_set_credits(struct inode *inode, size_t value_len)
1393 +int ext4_xattr_set_credits(struct inode *inode, size_t value_len, int *credits)
1395 -       struct super_block *sb = inode->i_sb;
1396 -       int credits;
1398 -       if (!EXT4_SB(sb)->s_journal)
1399 -               return 0;
1400 +       struct buffer_head *bh;
1401 +       int err;
1403 -       credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
1404 +       *credits = 0;
1406 -       /*
1407 -        * In case of inline data, we may push out the data to a block,
1408 -        * so we need to reserve credits for this eventuality
1409 -        */
1410 -       if (ext4_has_inline_data(inode))
1411 -               credits += ext4_writepage_trans_blocks(inode) + 1;
1413 -       if (ext4_has_feature_ea_inode(sb)) {
1414 -               int nrblocks = (value_len + sb->s_blocksize - 1) >>
1415 -                                       sb->s_blocksize_bits;
1416 +       if (!EXT4_SB(inode->i_sb)->s_journal)
1417 +               return 0;
1419 -               /* For new inode */
1420 -               credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
1421 +       down_read(&EXT4_I(inode)->xattr_sem);
1423 -               /* For data blocks of EA inode */
1424 -               credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
1425 +       bh = ext4_xattr_get_block(inode);
1426 +       if (IS_ERR(bh)) {
1427 +               err = PTR_ERR(bh);
1428 +       } else {
1429 +               *credits = __ext4_xattr_set_credits(inode->i_sb, bh, value_len);
1430 +               brelse(bh);
1431 +               err = 0;
1432         }
1433 -       return credits;
1435 +       up_read(&EXT4_I(inode)->xattr_sem);
1436 +       return err;
1439  /*
1440 @@ -1760,7 +2276,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
1441                 return error;
1443  retry:
1444 -       credits = ext4_xattr_set_credits(inode, value_len);
1445 +       error = ext4_xattr_set_credits(inode, value_len, &credits);
1446 +       if (error)
1447 +               return error;
1449         handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
1450         if (IS_ERR(handle)) {
1451                 error = PTR_ERR(handle);
1452 @@ -2066,10 +2585,10 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
1453         return error;
1457  #define EIA_INCR 16 /* must be 2^n */
1458  #define EIA_MASK (EIA_INCR - 1)
1459 -/* Add the large xattr @inode into @ea_inode_array for later deletion.
1461 +/* Add the large xattr @inode into @ea_inode_array for deferred iput().
1462   * If @ea_inode_array is new or full it will be grown and the old
1463   * contents copied over.
1464   */
1465 @@ -2114,21 +2633,19 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
1466   * ext4_xattr_delete_inode()
1467   *
1468   * Free extended attribute resources associated with this inode. Traverse
1469 - * all entries and unlink any xattr inodes associated with this inode. This
1470 - * is called immediately before an inode is freed. We have exclusive
1471 - * access to the inode. If an orphan inode is deleted it will also delete any
1472 - * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
1473 - * to ensure they belong to the parent inode and were not deleted already.
1474 + * all entries and decrement reference on any xattr inodes associated with this
1475 + * inode. This is called immediately before an inode is freed. We have exclusive
1476 + * access to the inode. If an orphan inode is deleted it will also release its
1477 + * references on xattr block and xattr inodes.
1478   */
1479 -int
1480 -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1481 -                       struct ext4_xattr_inode_array **ea_inode_array,
1482 -                       int extra_credits)
1483 +int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1484 +                           struct ext4_xattr_inode_array **ea_inode_array,
1485 +                           int extra_credits)
1487         struct buffer_head *bh = NULL;
1488         struct ext4_xattr_ibody_header *header;
1489 -       struct ext4_inode *raw_inode;
1490         struct ext4_iloc iloc = { .bh = NULL };
1491 +       struct ext4_xattr_entry *entry;
1492         int error;
1494         error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
1495 @@ -2140,66 +2657,71 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1496                 goto cleanup;
1497         }
1499 -       if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
1500 -               goto delete_external_ea;
1501 +       if (ext4_has_feature_ea_inode(inode->i_sb) &&
1502 +           ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
1504 -       error = ext4_get_inode_loc(inode, &iloc);
1505 -       if (error)
1506 -               goto cleanup;
1508 -       error = ext4_journal_get_write_access(handle, iloc.bh);
1509 -       if (error)
1510 -               goto cleanup;
1511 +               error = ext4_get_inode_loc(inode, &iloc);
1512 +               if (error) {
1513 +                       EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
1514 +                       goto cleanup;
1515 +               }
1517 -       raw_inode = ext4_raw_inode(&iloc);
1518 -       header = IHDR(inode, raw_inode);
1519 -       ext4_xattr_inode_remove_all(handle, inode, iloc.bh, IFIRST(header),
1520 -                                   false /* block_csum */, ea_inode_array,
1521 -                                   extra_credits);
1522 +               error = ext4_journal_get_write_access(handle, iloc.bh);
1523 +               if (error) {
1524 +                       EXT4_ERROR_INODE(inode, "write access (error %d)",
1525 +                                        error);
1526 +                       goto cleanup;
1527 +               }
1529 -delete_external_ea:
1530 -       if (!EXT4_I(inode)->i_file_acl) {
1531 -               error = 0;
1532 -               goto cleanup;
1533 -       }
1534 -       bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1535 -       if (!bh) {
1536 -               EXT4_ERROR_INODE(inode, "block %llu read error",
1537 -                                EXT4_I(inode)->i_file_acl);
1538 -               error = -EIO;
1539 -               goto cleanup;
1540 -       }
1541 -       if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1542 -           BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1543 -               EXT4_ERROR_INODE(inode, "bad block %llu",
1544 -                                EXT4_I(inode)->i_file_acl);
1545 -               error = -EFSCORRUPTED;
1546 -               goto cleanup;
1547 +               header = IHDR(inode, ext4_raw_inode(&iloc));
1548 +               if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
1549 +                       ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
1550 +                                                    IFIRST(header),
1551 +                                                    false /* block_csum */,
1552 +                                                    ea_inode_array,
1553 +                                                    extra_credits,
1554 +                                                    false /* skip_quota */);
1555         }
1557 -       if (ext4_has_feature_ea_inode(inode->i_sb)) {
1558 -               error = ext4_journal_get_write_access(handle, bh);
1559 -               if (error) {
1560 -                       EXT4_ERROR_INODE(inode, "write access %llu",
1561 +       if (EXT4_I(inode)->i_file_acl) {
1562 +               bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1563 +               if (!bh) {
1564 +                       EXT4_ERROR_INODE(inode, "block %llu read error",
1565                                          EXT4_I(inode)->i_file_acl);
1566 +                       error = -EIO;
1567 +                       goto cleanup;
1568 +               }
1569 +               error = ext4_xattr_check_block(inode, bh);
1570 +               if (error) {
1571 +                       EXT4_ERROR_INODE(inode, "bad block %llu (error %d)",
1572 +                                        EXT4_I(inode)->i_file_acl, error);
1573                         goto cleanup;
1574                 }
1575 -               ext4_xattr_inode_remove_all(handle, inode, bh,
1576 -                                           BFIRST(bh),
1577 -                                           true /* block_csum */,
1578 -                                           ea_inode_array,
1579 -                                           extra_credits);
1580 -       }
1582 -       ext4_xattr_release_block(handle, inode, bh);
1583 -       /* Update i_file_acl within the same transaction that releases block. */
1584 -       EXT4_I(inode)->i_file_acl = 0;
1585 -       error = ext4_mark_inode_dirty(handle, inode);
1586 -       if (error) {
1587 -               EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
1588 -                                error);
1589 -               goto cleanup;
1590 +               if (ext4_has_feature_ea_inode(inode->i_sb)) {
1591 +                       for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
1592 +                            entry = EXT4_XATTR_NEXT(entry))
1593 +                               if (entry->e_value_inum)
1594 +                                       ext4_xattr_inode_free_quota(inode,
1595 +                                             le32_to_cpu(entry->e_value_size));
1597 +               }
1599 +               ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
1600 +                                        extra_credits);
1601 +               /*
1602 +                * Update i_file_acl value in the same transaction that releases
1603 +                * block.
1604 +                */
1605 +               EXT4_I(inode)->i_file_acl = 0;
1606 +               error = ext4_mark_inode_dirty(handle, inode);
1607 +               if (error) {
1608 +                       EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
1609 +                                        error);
1610 +                       goto cleanup;
1611 +               }
1612         }
1613 +       error = 0;
1614  cleanup:
1615         brelse(iloc.bh);
1616         brelse(bh);
1617 @@ -2208,17 +2730,13 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1619  void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
1621 -       struct inode    *ea_inode;
1622 -       int             idx = 0;
1623 +       int idx;
1625         if (ea_inode_array == NULL)
1626                 return;
1628 -       for (; idx < ea_inode_array->count; ++idx) {
1629 -               ea_inode = ea_inode_array->inodes[idx];
1630 -               clear_nlink(ea_inode);
1631 -               iput(ea_inode);
1632 -       }
1633 +       for (idx = 0; idx < ea_inode_array->count; ++idx)
1634 +               iput(ea_inode_array->inodes[idx]);
1635         kfree(ea_inode_array);
1638 diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
1639 index b2005a2716d9..67616cb9a059 100644
1640 --- a/fs/ext4/xattr.h
1641 +++ b/fs/ext4/xattr.h
1642 @@ -69,19 +69,6 @@ struct ext4_xattr_entry {
1643                 EXT4_I(inode)->i_extra_isize))
1644  #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
1647 - * Link EA inode back to parent one using i_mtime field.
1648 - * Extra integer type conversion added to ignore higher
1649 - * bits in i_mtime.tv_sec which might be set by ext4_get()
1650 - */
1651 -#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
1652 -do {                                                  \
1653 -      (inode)->i_mtime.tv_sec = inum;                 \
1654 -} while(0)
1656 -#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
1657 -((__u32)(inode)->i_mtime.tv_sec)
1659  /*
1660   * The minimum size of EA value when you start storing it in an external inode
1661   * size of block - size of header - size of 1 entry - 4 null bytes
1662 @@ -165,9 +152,9 @@ extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
1663  extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
1664  extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
1665  extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
1666 -extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len);
1667 +extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
1668 +                                 int *credits);
1670 -extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
1671  extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
1672                                    struct ext4_xattr_inode_array **array,
1673                                    int extra_credits);
1674 diff --git a/fs/mbcache.c b/fs/mbcache.c
1675 index 45a8d52dc991..d818fd236787 100644
1676 --- a/fs/mbcache.c
1677 +++ b/fs/mbcache.c
1678 @@ -13,10 +13,11 @@
1679   * mb_cache_entry_delete()).
1680   *
1681   * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
1682 - * They use hash of a block contents as a key and block number as a value.
1683 - * That's why keys need not be unique (different xattr blocks may end up having
1684 - * the same hash). However block number always uniquely identifies a cache
1685 - * entry.
1686 + * Ext4 also uses it for deduplication of xattr values stored in inodes.
1687 + * Both filesystems use a hash of the data as the key and provide a value that
1688 + * may represent a block or inode number. That's why keys need not be unique
1689 + * (different data may hash to the same key). However, the user-provided value
1690 + * always uniquely identifies a cache entry.
1691   *
1692   * We provide functions for creation and removal of entries, search by key,
1693   * and a special "delete entry with given key-value pair" operation. Fixed
1694 -- 
1695 2.13.1.611.g7e3b11ae1-goog