add patch fix-off-by-one-in-loop-termination-in-ext4_find_unwritten_pgoff
[ext4-patch-queue.git] / add-largedir-feature
blob38daefa27dea8b00c4c8eda4fd46b8bbc7134779
1 ext4: add largedir feature
3 From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
5 This INCOMPAT_LARGEDIR feature allows larger directories to be created
6 in ldiskfs, both with directory sizes over 2GB and a maximum htree
7 depth of 3 instead of the current limit of 2. These features are needed
8 in order to exceed the current limit of approximately 10M entries in a
9 single directory.
11 Signed-off-by: Yang Sheng <yang.sheng@intel.com>
12 Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
13 ---
14  fs/ext4/ext4.h  |  23 ++++++++---
15  fs/ext4/inode.c |   4 +-
16  fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++-----------------
17  3 files changed, 102 insertions(+), 43 deletions(-)
19 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
20 index 01d52b9..0bbbd9b 100644
21 --- a/fs/ext4/ext4.h
22 +++ b/fs/ext4/ext4.h
23 @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
24                                          EXT4_FEATURE_INCOMPAT_MMP | \
25                                          EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
26                                          EXT4_FEATURE_INCOMPAT_ENCRYPT | \
27 -                                        EXT4_FEATURE_INCOMPAT_CSUM_SEED)
28 +                                        EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
29 +                                        EXT4_FEATURE_INCOMPAT_LARGEDIR)
30  #define EXT4_FEATURE_RO_COMPAT_SUPP    (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
31                                          EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
32                                          EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
33 @@ -2125,6 +2126,16 @@ struct dir_private_info {
34   */
35  #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
37 +/* htree levels for ext4 */
38 +#define        EXT4_HTREE_LEVEL_COMPAT 2
39 +#define        EXT4_HTREE_LEVEL        3
41 +static inline int ext4_dir_htree_level(struct super_block *sb)
43 +       return ext4_has_feature_largedir(sb) ?
44 +               EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
47  /*
48   * Timeout and state flag for lazy initialization inode thread.
49   */
50 @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
51         es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
52  }
54 -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
55 +static inline loff_t ext4_isize(struct super_block *sb,
56 +                               struct ext4_inode *raw_inode)
57  {
58 -       if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
59 +       if (ext4_has_feature_largedir(sb) ||
60 +           S_ISREG(le16_to_cpu(raw_inode->i_mode)))
61                 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
62                         le32_to_cpu(raw_inode->i_size_lo);
63 -       else
64 -               return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
66 +       return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
67  }
69  static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
70 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
71 index f622d4a..5787f3d 100644
72 --- a/fs/ext4/inode.c
73 +++ b/fs/ext4/inode.c
74 @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
75         if (ext4_has_feature_64bit(sb))
76                 ei->i_file_acl |=
77                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
78 -       inode->i_size = ext4_isize(raw_inode);
79 +       inode->i_size = ext4_isize(sb, raw_inode);
80         if ((size = i_size_read(inode)) < 0) {
81                 EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
82                 ret = -EFSCORRUPTED;
83 @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle,
84                 raw_inode->i_file_acl_high =
85                         cpu_to_le16(ei->i_file_acl >> 32);
86         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
87 -       if (ei->i_disksize != ext4_isize(raw_inode)) {
88 +       if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
89                 ext4_isize_set(raw_inode, ei->i_disksize);
90                 need_datasync = 1;
91         }
92 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
93 index 6ad612c..3298fe3 100644
94 --- a/fs/ext4/namei.c
95 +++ b/fs/ext4/namei.c
96 @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
98  static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
99  {
100 -       return le32_to_cpu(entry->block) & 0x00ffffff;
101 +       return le32_to_cpu(entry->block) & 0x0fffffff;
104  static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
105 @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
106         struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
107         u32 hash;
109 +       memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
110         frame->bh = ext4_read_dirblock(dir, 0, INDEX);
111         if (IS_ERR(frame->bh))
112                 return (struct dx_frame *) frame->bh;
113 @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
114         }
116         indirect = root->info.indirect_levels;
117 -       if (indirect > 1) {
118 -               ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
119 -                                  root->info.indirect_levels);
120 +       if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
121 +               ext4_warning(dir->i_sb,
122 +                            "Directory (ino: %lu) htree depth %#06x exceed"
123 +                            "supported value", dir->i_ino,
124 +                            ext4_dir_htree_level(dir->i_sb));
125 +               if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
126 +                       ext4_warning(dir->i_sb, "Enable large directory "
127 +                                               "feature to access it");
128 +               }
129                 goto fail;
130         }
132 @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
134  static void dx_release(struct dx_frame *frames)
136 +       struct dx_root_info *info;
137 +       int i;
139         if (frames[0].bh == NULL)
140                 return;
142 -       if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
143 -               brelse(frames[1].bh);
144 -       brelse(frames[0].bh);
145 +       info = &((struct dx_root *)frames[0].bh->b_data)->info;
146 +       for (i = 0; i <= info->indirect_levels; i++) {
147 +               if (frames[i].bh == NULL)
148 +                       break;
149 +               brelse(frames[i].bh);
150 +               frames[i].bh = NULL;
151 +       }
154  /*
155 @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
157         struct dx_hash_info hinfo;
158         struct ext4_dir_entry_2 *de;
159 -       struct dx_frame frames[2], *frame;
160 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
161         struct inode *dir;
162         ext4_lblk_t block;
163         int count = 0;
164 @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
165                         struct ext4_dir_entry_2 **res_dir)
167         struct super_block * sb = dir->i_sb;
168 -       struct dx_frame frames[2], *frame;
169 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
170         const struct qstr *d_name = fname->usr_fname;
171         struct buffer_head *bh;
172         ext4_lblk_t block;
173 @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
174          */
175         dir->i_mtime = dir->i_ctime = current_time(dir);
176         ext4_update_dx_flag(dir);
177 -       dir->i_version++;
178 +       inode_inc_iversion(dir);
179         ext4_mark_inode_dirty(handle, dir);
180         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
181         err = ext4_handle_dirty_dirent_node(handle, dir, bh);
182 @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
184         struct buffer_head *bh2;
185         struct dx_root  *root;
186 -       struct dx_frame frames[2], *frame;
187 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
188         struct dx_entry *entries;
189         struct ext4_dir_entry_2 *de, *de2;
190         struct ext4_dir_entry_tail *t;
191 @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
192  static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
193                              struct inode *dir, struct inode *inode)
195 -       struct dx_frame frames[2], *frame;
196 +       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
197         struct dx_entry *entries, *at;
198         struct buffer_head *bh;
199         struct super_block *sb = dir->i_sb;
200         struct ext4_dir_entry_2 *de;
201 +       int restart;
202         int err;
204 +again:
205 +       restart = 0;
206         frame = dx_probe(fname, dir, NULL, frames);
207         if (IS_ERR(frame))
208                 return PTR_ERR(frame);
209 @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
210         if (err != -ENOSPC)
211                 goto cleanup;
213 +       err = 0;
214         /* Block full, should compress but for now just split */
215         dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
216                        dx_get_count(entries), dx_get_limit(entries)));
217         /* Need to split index? */
218         if (dx_get_count(entries) == dx_get_limit(entries)) {
219                 ext4_lblk_t newblock;
220 -               unsigned icount = dx_get_count(entries);
221 -               int levels = frame - frames;
222 +               int levels = frame - frames + 1;
223 +               unsigned int icount;
224 +               int add_level = 1;
225                 struct dx_entry *entries2;
226                 struct dx_node *node2;
227                 struct buffer_head *bh2;
229 -               if (levels && (dx_get_count(frames->entries) ==
230 -                              dx_get_limit(frames->entries))) {
231 -                       ext4_warning_inode(dir, "Directory index full!");
232 +               while (frame > frames) {
233 +                       if (dx_get_count((frame - 1)->entries) <
234 +                           dx_get_limit((frame - 1)->entries)) {
235 +                               add_level = 0;
236 +                               break;
237 +                       }
238 +                       frame--; /* split higher index block */
239 +                       at = frame->at;
240 +                       entries = frame->entries;
241 +                       restart = 1;
242 +               }
243 +               if (add_level && levels == ext4_dir_htree_level(sb)) {
244 +                       ext4_warning(sb, "Directory (ino: %lu) index full, "
245 +                                        "reach max htree level :%d",
246 +                                        dir->i_ino, levels);
247 +                       if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
248 +                               ext4_warning(sb, "Large directory feature is "
249 +                                                "not enabled on this "
250 +                                                "filesystem");
251 +                       }
252                         err = -ENOSPC;
253                         goto cleanup;
254                 }
255 +               icount = dx_get_count(entries);
256                 bh2 = ext4_append(handle, dir, &newblock);
257                 if (IS_ERR(bh2)) {
258                         err = PTR_ERR(bh2);
259 @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
260                 err = ext4_journal_get_write_access(handle, frame->bh);
261                 if (err)
262                         goto journal_error;
263 -               if (levels) {
264 +               if (!add_level) {
265                         unsigned icount1 = icount/2, icount2 = icount - icount1;
266                         unsigned hash2 = dx_get_hash(entries + icount1);
267                         dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
268 @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
270                         BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
271                         err = ext4_journal_get_write_access(handle,
272 -                                                            frames[0].bh);
273 +                                                            (frame - 1)->bh);
274                         if (err)
275                                 goto journal_error;
277 @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
278                                 frame->entries = entries = entries2;
279                                 swap(frame->bh, bh2);
280                         }
281 -                       dx_insert_block(frames + 0, hash2, newblock);
282 -                       dxtrace(dx_show_index("node", frames[1].entries));
283 +                       dx_insert_block((frame - 1), hash2, newblock);
284 +                       dxtrace(dx_show_index("node", frame->entries));
285                         dxtrace(dx_show_index("node",
286                                ((struct dx_node *) bh2->b_data)->entries));
287                         err = ext4_handle_dirty_dx_node(handle, dir, bh2);
288                         if (err)
289                                 goto journal_error;
290                         brelse (bh2);
291 +                       ext4_handle_dirty_metadata(handle, dir,
292 +                                                  (frame - 1)->bh);
293 +                       if (restart) {
294 +                               ext4_handle_dirty_metadata(handle, dir,
295 +                                                          frame->bh);
296 +                               goto cleanup;
297 +                       }
298                 } else {
299 -                       dxtrace(printk(KERN_DEBUG
300 -                                      "Creating second level index...\n"));
301 +                       struct dx_root *dxroot;
302                         memcpy((char *) entries2, (char *) entries,
303                                icount * sizeof(struct dx_entry));
304                         dx_set_limit(entries2, dx_node_limit(dir));
305 @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
306                         /* Set up root */
307                         dx_set_count(entries, 1);
308                         dx_set_block(entries + 0, newblock);
309 -                       ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
311 -                       /* Add new access path frame */
312 -                       frame = frames + 1;
313 -                       frame->at = at = at - entries + entries2;
314 -                       frame->entries = entries = entries2;
315 -                       frame->bh = bh2;
316 -                       err = ext4_journal_get_write_access(handle,
317 -                                                            frame->bh);
318 -                       if (err)
319 -                               goto journal_error;
320 +                       dxroot = (struct dx_root *)frames[0].bh->b_data;
321 +                       dxroot->info.indirect_levels += 1;
322 +                       dxtrace(printk(KERN_DEBUG
323 +                                      "Creating %d level index...\n",
324 +                                      info->indirect_levels));
325 +                       ext4_handle_dirty_metadata(handle, dir, frame->bh);
326 +                       ext4_handle_dirty_metadata(handle, dir, bh2);
327 +                       brelse(bh2);
328 +                       restart = 1;
329 +                       goto cleanup;
330                 }
331 -               err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
332                 if (err) {
333                         ext4_std_error(inode->i_sb, err);
334                         goto cleanup;
335 @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
336  cleanup:
337         brelse(bh);
338         dx_release(frames);
339 +       /* @restart is true means htree-path has been changed, we need to
340 +        * repeat dx_probe() to find out valid htree-path
341 +        */
342 +       if (restart && err == 0)
343 +               goto again;
344         return err;
347 @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle,
348                                         blocksize);
349                         else
350                                 de->inode = 0;
351 -                       dir->i_version++;
352 +                       inode_inc_iversion(dir);
353                         return 0;
354                 }
355                 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
356 -- 
357 1.8.3.1