add patch fix-ext4_new_inode-journal-credits-calculation
[ext4-patch-queue.git] / add-largedir-feature
blobee0d10e4b44f65f5d4d8ce3ddcfbac1061be362a
1 ext4: add largedir feature
3 From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
5 This INCOMPAT_LARGEDIR feature allows larger directories to be created
6 in ldiskfs, both with directory sizes over 2GB and a maximum htree
7 depth of 3 instead of the current limit of 2. These features are needed
8 in order to exceed the current limit of approximately 10M entries in a
9 single directory.
11 This patch was originally written by Yang Sheng to support the Lustre server.
13 [ Bumped the credits needed to update an indexed directory -- tytso ]
15 Signed-off-by: Liang Zhen <liang.zhen@intel.com>
16 Signed-off-by: Yang Sheng <yang.sheng@intel.com>
17 Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
18 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
19 Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
20 ---
21  fs/ext4/ext4.h      |  23 ++++++++++---
22  fs/ext4/ext4_jbd2.h |   9 ++++-
23  fs/ext4/inode.c     |   4 +--
24  fs/ext4/namei.c     | 124 ++++++++++++++++++++++++++++++++++++++++++++++---------------------
25  4 files changed, 113 insertions(+), 47 deletions(-)
27 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
28 index 32191548abed..f17a4e7075be 100644
29 --- a/fs/ext4/ext4.h
30 +++ b/fs/ext4/ext4.h
31 @@ -1800,7 +1800,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,              ENCRYPT)
32                                          EXT4_FEATURE_INCOMPAT_MMP | \
33                                          EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
34                                          EXT4_FEATURE_INCOMPAT_ENCRYPT | \
35 -                                        EXT4_FEATURE_INCOMPAT_CSUM_SEED)
36 +                                        EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
37 +                                        EXT4_FEATURE_INCOMPAT_LARGEDIR)
38  #define EXT4_FEATURE_RO_COMPAT_SUPP    (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
39                                          EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
40                                          EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
41 @@ -2126,6 +2127,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
42   */
43  #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
45 +/* htree levels for ext4 */
46 +#define        EXT4_HTREE_LEVEL_COMPAT 2
47 +#define        EXT4_HTREE_LEVEL        3
49 +static inline int ext4_dir_htree_level(struct super_block *sb)
51 +       return ext4_has_feature_largedir(sb) ?
52 +               EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
55  /*
56   * Timeout and state flag for lazy initialization inode thread.
57   */
58 @@ -2756,13 +2767,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
59         es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
60  }
62 -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
63 +static inline loff_t ext4_isize(struct super_block *sb,
64 +                               struct ext4_inode *raw_inode)
65  {
66 -       if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
67 +       if (ext4_has_feature_largedir(sb) ||
68 +           S_ISREG(le16_to_cpu(raw_inode->i_mode)))
69                 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
70                         le32_to_cpu(raw_inode->i_size_lo);
71 -       else
72 -               return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
74 +       return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
75  }
77  static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
78 diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
79 index f97611171023..5e61e464d71c 100644
80 --- a/fs/ext4/ext4_jbd2.h
81 +++ b/fs/ext4/ext4_jbd2.h
82 @@ -77,7 +77,14 @@
84  #define EXT4_RESERVE_TRANS_BLOCKS      12U
86 -#define EXT4_INDEX_EXTRA_TRANS_BLOCKS  8
87 +/*
88 + * Number of credits needed if we need to insert an entry into a
89 + * directory.  For each new index block, we need 4 blocks (old index
90 + * block, new index block, bitmap block, bg summary).  For normal
91 + * htree directories there are 2 levels; if the largedir feature
92 + * is enabled it's 3 levels.
93 + */
94 +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS  12U
96  #ifdef CONFIG_QUOTA
97  /* Amount of blocks needed for quota update - we know that the structure was
98 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
99 index 5cf82d03968c..47604d1352fc 100644
100 --- a/fs/ext4/inode.c
101 +++ b/fs/ext4/inode.c
102 @@ -4712,7 +4712,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
103         if (ext4_has_feature_64bit(sb))
104                 ei->i_file_acl |=
105                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
106 -       inode->i_size = ext4_isize(raw_inode);
107 +       inode->i_size = ext4_isize(sb, raw_inode);
108         if ((size = i_size_read(inode)) < 0) {
109                 EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
110                 ret = -EFSCORRUPTED;
111 @@ -5037,7 +5037,7 @@ static int ext4_do_update_inode(handle_t *handle,
112                 raw_inode->i_file_acl_high =
113                         cpu_to_le16(ei->i_file_acl >> 32);
114         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
115 -       if (ei->i_disksize != ext4_isize(raw_inode)) {
116 +       if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
117                 ext4_isize_set(raw_inode, ei->i_disksize);
118                 need_datasync = 1;
119         }
120 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
121 index 404256caf9cf..423e1f761768 100644
122 --- a/fs/ext4/namei.c
123 +++ b/fs/ext4/namei.c
124 @@ -513,7 +513,7 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
126  static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
128 -       return le32_to_cpu(entry->block) & 0x00ffffff;
129 +       return le32_to_cpu(entry->block) & 0x0fffffff;
132  static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
133 @@ -739,6 +739,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
134         struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
135         u32 hash;
137 +       memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
138         frame->bh = ext4_read_dirblock(dir, 0, INDEX);
139         if (IS_ERR(frame->bh))
140                 return (struct dx_frame *) frame->bh;
141 @@ -768,9 +769,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
142         }
144         indirect = root->info.indirect_levels;
145 -       if (indirect > 1) {
146 -               ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
147 -                                  root->info.indirect_levels);
148 +       if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
149 +               ext4_warning(dir->i_sb,
150 +                            "Directory (ino: %lu) htree depth %#06x exceed"
151 +                            "supported value", dir->i_ino,
152 +                            ext4_dir_htree_level(dir->i_sb));
153 +               if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
154 +                       ext4_warning(dir->i_sb, "Enable large directory "
155 +                                               "feature to access it");
156 +               }
157                 goto fail;
158         }
160 @@ -859,12 +866,19 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
162  static void dx_release(struct dx_frame *frames)
164 +       struct dx_root_info *info;
165 +       int i;
167         if (frames[0].bh == NULL)
168                 return;
170 -       if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
171 -               brelse(frames[1].bh);
172 -       brelse(frames[0].bh);
173 +       info = &((struct dx_root *)frames[0].bh->b_data)->info;
174 +       for (i = 0; i <= info->indirect_levels; i++) {
175 +               if (frames[i].bh == NULL)
176 +                       break;
177 +               brelse(frames[i].bh);
178 +               frames[i].bh = NULL;
179 +       }
182  /*
183 @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
185         struct dx_hash_info hinfo;
186         struct ext4_dir_entry_2 *de;
187 -       struct dx_frame frames[2], *frame;
188 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
189         struct inode *dir;
190         ext4_lblk_t block;
191         int count = 0;
192 @@ -1485,7 +1499,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
193                         struct ext4_dir_entry_2 **res_dir)
195         struct super_block * sb = dir->i_sb;
196 -       struct dx_frame frames[2], *frame;
197 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
198         struct buffer_head *bh;
199         ext4_lblk_t block;
200         int retval;
201 @@ -1889,7 +1903,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
202          */
203         dir->i_mtime = dir->i_ctime = current_time(dir);
204         ext4_update_dx_flag(dir);
205 -       dir->i_version++;
206 +       inode_inc_iversion(dir);
207         ext4_mark_inode_dirty(handle, dir);
208         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
209         err = ext4_handle_dirty_dirent_node(handle, dir, bh);
210 @@ -1908,7 +1922,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
212         struct buffer_head *bh2;
213         struct dx_root  *root;
214 -       struct dx_frame frames[2], *frame;
215 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
216         struct dx_entry *entries;
217         struct ext4_dir_entry_2 *de, *de2;
218         struct ext4_dir_entry_tail *t;
219 @@ -2127,13 +2141,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
220  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
221                              struct inode *dir, struct inode *inode)
223 -       struct dx_frame frames[2], *frame;
224 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
225         struct dx_entry *entries, *at;
226         struct buffer_head *bh;
227         struct super_block *sb = dir->i_sb;
228         struct ext4_dir_entry_2 *de;
229 +       int restart;
230         int err;
232 +again:
233 +       restart = 0;
234         frame = dx_probe(fname, dir, NULL, frames);
235         if (IS_ERR(frame))
236                 return PTR_ERR(frame);
237 @@ -2155,24 +2172,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
238         if (err != -ENOSPC)
239                 goto cleanup;
241 +       err = 0;
242         /* Block full, should compress but for now just split */
243         dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
244                        dx_get_count(entries), dx_get_limit(entries)));
245         /* Need to split index? */
246         if (dx_get_count(entries) == dx_get_limit(entries)) {
247                 ext4_lblk_t newblock;
248 -               unsigned icount = dx_get_count(entries);
249 -               int levels = frame - frames;
250 +               int levels = frame - frames + 1;
251 +               unsigned int icount;
252 +               int add_level = 1;
253                 struct dx_entry *entries2;
254                 struct dx_node *node2;
255                 struct buffer_head *bh2;
257 -               if (levels && (dx_get_count(frames->entries) ==
258 -                              dx_get_limit(frames->entries))) {
259 -                       ext4_warning_inode(dir, "Directory index full!");
260 +               while (frame > frames) {
261 +                       if (dx_get_count((frame - 1)->entries) <
262 +                           dx_get_limit((frame - 1)->entries)) {
263 +                               add_level = 0;
264 +                               break;
265 +                       }
266 +                       frame--; /* split higher index block */
267 +                       at = frame->at;
268 +                       entries = frame->entries;
269 +                       restart = 1;
270 +               }
271 +               if (add_level && levels == ext4_dir_htree_level(sb)) {
272 +                       ext4_warning(sb, "Directory (ino: %lu) index full, "
273 +                                        "reach max htree level :%d",
274 +                                        dir->i_ino, levels);
275 +                       if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
276 +                               ext4_warning(sb, "Large directory feature is "
277 +                                                "not enabled on this "
278 +                                                "filesystem");
279 +                       }
280                         err = -ENOSPC;
281                         goto cleanup;
282                 }
283 +               icount = dx_get_count(entries);
284                 bh2 = ext4_append(handle, dir, &newblock);
285                 if (IS_ERR(bh2)) {
286                         err = PTR_ERR(bh2);
287 @@ -2187,7 +2224,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
288                 err = ext4_journal_get_write_access(handle, frame->bh);
289                 if (err)
290                         goto journal_error;
291 -               if (levels) {
292 +               if (!add_level) {
293                         unsigned icount1 = icount/2, icount2 = icount - icount1;
294                         unsigned hash2 = dx_get_hash(entries + icount1);
295                         dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
296 @@ -2195,7 +2232,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
298                         BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
299                         err = ext4_journal_get_write_access(handle,
300 -                                                            frames[0].bh);
301 +                                                            (frame - 1)->bh);
302                         if (err)
303                                 goto journal_error;
305 @@ -2211,17 +2248,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
306                                 frame->entries = entries = entries2;
307                                 swap(frame->bh, bh2);
308                         }
309 -                       dx_insert_block(frames + 0, hash2, newblock);
310 -                       dxtrace(dx_show_index("node", frames[1].entries));
311 +                       dx_insert_block((frame - 1), hash2, newblock);
312 +                       dxtrace(dx_show_index("node", frame->entries));
313                         dxtrace(dx_show_index("node",
314                                ((struct dx_node *) bh2->b_data)->entries));
315                         err = ext4_handle_dirty_dx_node(handle, dir, bh2);
316                         if (err)
317                                 goto journal_error;
318                         brelse (bh2);
319 +                       err = ext4_handle_dirty_dx_node(handle, dir,
320 +                                                  (frame - 1)->bh);
321 +                       if (err)
322 +                               goto journal_error;
323 +                       if (restart) {
324 +                               err = ext4_handle_dirty_dx_node(handle, dir,
325 +                                                          frame->bh);
326 +                               goto journal_error;
327 +                       }
328                 } else {
329 -                       dxtrace(printk(KERN_DEBUG
330 -                                      "Creating second level index...\n"));
331 +                       struct dx_root *dxroot;
332                         memcpy((char *) entries2, (char *) entries,
333                                icount * sizeof(struct dx_entry));
334                         dx_set_limit(entries2, dx_node_limit(dir));
335 @@ -2229,22 +2274,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
336                         /* Set up root */
337                         dx_set_count(entries, 1);
338                         dx_set_block(entries + 0, newblock);
339 -                       ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
341 -                       /* Add new access path frame */
342 -                       frame = frames + 1;
343 -                       frame->at = at = at - entries + entries2;
344 -                       frame->entries = entries = entries2;
345 -                       frame->bh = bh2;
346 -                       err = ext4_journal_get_write_access(handle,
347 -                                                            frame->bh);
348 +                       dxroot = (struct dx_root *)frames[0].bh->b_data;
349 +                       dxroot->info.indirect_levels += 1;
350 +                       dxtrace(printk(KERN_DEBUG
351 +                                      "Creating %d level index...\n",
352 +                                      info->indirect_levels));
353 +                       err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
354                         if (err)
355                                 goto journal_error;
356 -               }
357 -               err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
358 -               if (err) {
359 -                       ext4_std_error(inode->i_sb, err);
360 -                       goto cleanup;
361 +                       err = ext4_handle_dirty_dx_node(handle, dir, bh2);
362 +                       brelse(bh2);
363 +                       restart = 1;
364 +                       goto journal_error;
365                 }
366         }
367         de = do_split(handle, dir, &bh, frame, &fname->hinfo);
368 @@ -2256,10 +2297,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
369         goto cleanup;
371  journal_error:
372 -       ext4_std_error(dir->i_sb, err);
373 +       ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
374  cleanup:
375         brelse(bh);
376         dx_release(frames);
377 +       /* If @restart is set, the htree path has changed, so we need to
378 +        * repeat dx_probe() to find a valid htree path
379 +        */
380 +       if (restart && err == 0)
381 +               goto again;
382         return err;
385 @@ -2296,7 +2342,7 @@ int ext4_generic_delete_entry(handle_t *handle,
386                                         blocksize);
387                         else
388                                 de->inode = 0;
389 -                       dir->i_version++;
390 +                       inode_inc_iversion(dir);
391                         return 0;
392                 }
393                 i += ext4_rec_len_from_disk(de->rec_len, blocksize);