1 ext4: add largedir feature
3 From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
5 This INCOMPAT_LARGEDIR feature allows larger directories to be created
6 in ldiskfs, both with directory sizes over 2GB and a maximum htree
7 depth of 3 instead of the current limit of 2. These features are needed
8 in order to exceed the current limit of approximately 10M entries in a single directory.
11 This patch was originally written by Yang Sheng to support the Lustre server.
13 Signed-off-by: Liang Zhen <liang.zhen@intel.com>
14 Signed-off-by: Yang Sheng <yang.sheng@intel.com>
15 Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
16 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
17 Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
19 fs/ext4/ext4.h | 23 ++++++++++---
20 fs/ext4/inode.c | 4 +--
21 fs/ext4/namei.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
22 3 files changed, 105 insertions(+), 46 deletions(-)
24 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
25 index 32191548abed..f17a4e7075be 100644
28 @@ -1800,7 +1800,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
29 EXT4_FEATURE_INCOMPAT_MMP | \
30 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
31 EXT4_FEATURE_INCOMPAT_ENCRYPT | \
32 - EXT4_FEATURE_INCOMPAT_CSUM_SEED)
33 + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
34 + EXT4_FEATURE_INCOMPAT_LARGEDIR)
35 #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
36 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
37 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
38 @@ -2126,6 +2127,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
40 #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
42 +/* htree levels for ext4 */
43 +#define EXT4_HTREE_LEVEL_COMPAT 2
44 +#define EXT4_HTREE_LEVEL 3
46 +static inline int ext4_dir_htree_level(struct super_block *sb)
48 + return ext4_has_feature_largedir(sb) ?
49 + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
53 * Timeout and state flag for lazy initialization inode thread.
55 @@ -2756,13 +2767,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
56 es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
59 -static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
60 +static inline loff_t ext4_isize(struct super_block *sb,
61 + struct ext4_inode *raw_inode)
63 - if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
64 + if (ext4_has_feature_largedir(sb) ||
65 + S_ISREG(le16_to_cpu(raw_inode->i_mode)))
66 return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
67 le32_to_cpu(raw_inode->i_size_lo);
69 - return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
71 + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
74 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
75 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
76 index 5cf82d03968c..47604d1352fc 100644
79 @@ -4712,7 +4712,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
80 if (ext4_has_feature_64bit(sb))
82 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
83 - inode->i_size = ext4_isize(raw_inode);
84 + inode->i_size = ext4_isize(sb, raw_inode);
85 if ((size = i_size_read(inode)) < 0) {
86 EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
88 @@ -5037,7 +5037,7 @@ static int ext4_do_update_inode(handle_t *handle,
89 raw_inode->i_file_acl_high =
90 cpu_to_le16(ei->i_file_acl >> 32);
91 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
92 - if (ei->i_disksize != ext4_isize(raw_inode)) {
93 + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
94 ext4_isize_set(raw_inode, ei->i_disksize);
97 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
98 index 404256caf9cf..423e1f761768 100644
100 +++ b/fs/ext4/namei.c
101 @@ -513,7 +513,7 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
103 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
105 - return le32_to_cpu(entry->block) & 0x00ffffff;
106 + return le32_to_cpu(entry->block) & 0x0fffffff;
109 static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
110 @@ -739,6 +739,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
111 struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
114 + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
115 frame->bh = ext4_read_dirblock(dir, 0, INDEX);
116 if (IS_ERR(frame->bh))
117 return (struct dx_frame *) frame->bh;
118 @@ -768,9 +769,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
121 indirect = root->info.indirect_levels;
122 - if (indirect > 1) {
123 - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
124 - root->info.indirect_levels);
125 + if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
126 + ext4_warning(dir->i_sb,
127 + "Directory (ino: %lu) htree depth %#06x exceed"
128 + "supported value", dir->i_ino,
129 + ext4_dir_htree_level(dir->i_sb));
130 + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
131 + ext4_warning(dir->i_sb, "Enable large directory "
132 + "feature to access it");
137 @@ -859,12 +866,19 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
139 static void dx_release(struct dx_frame *frames)
141 + struct dx_root_info *info;
144 if (frames[0].bh == NULL)
147 - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
148 - brelse(frames[1].bh);
149 - brelse(frames[0].bh);
150 + info = &((struct dx_root *)frames[0].bh->b_data)->info;
151 + for (i = 0; i <= info->indirect_levels; i++) {
152 + if (frames[i].bh == NULL)
154 + brelse(frames[i].bh);
155 + frames[i].bh = NULL;
160 @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
162 struct dx_hash_info hinfo;
163 struct ext4_dir_entry_2 *de;
164 - struct dx_frame frames[2], *frame;
165 + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
169 @@ -1485,7 +1499,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
170 struct ext4_dir_entry_2 **res_dir)
172 struct super_block * sb = dir->i_sb;
173 - struct dx_frame frames[2], *frame;
174 + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
175 struct buffer_head *bh;
178 @@ -1889,7 +1903,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
180 dir->i_mtime = dir->i_ctime = current_time(dir);
181 ext4_update_dx_flag(dir);
183 + inode_inc_iversion(dir);
184 ext4_mark_inode_dirty(handle, dir);
185 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
186 err = ext4_handle_dirty_dirent_node(handle, dir, bh);
187 @@ -1908,7 +1922,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
189 struct buffer_head *bh2;
190 struct dx_root *root;
191 - struct dx_frame frames[2], *frame;
192 + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
193 struct dx_entry *entries;
194 struct ext4_dir_entry_2 *de, *de2;
195 struct ext4_dir_entry_tail *t;
196 @@ -2127,13 +2141,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
197 static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
198 struct inode *dir, struct inode *inode)
200 - struct dx_frame frames[2], *frame;
201 + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
202 struct dx_entry *entries, *at;
203 struct buffer_head *bh;
204 struct super_block *sb = dir->i_sb;
205 struct ext4_dir_entry_2 *de;
211 frame = dx_probe(fname, dir, NULL, frames);
213 return PTR_ERR(frame);
214 @@ -2155,24 +2172,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
219 /* Block full, should compress but for now just split */
220 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
221 dx_get_count(entries), dx_get_limit(entries)));
222 /* Need to split index? */
223 if (dx_get_count(entries) == dx_get_limit(entries)) {
224 ext4_lblk_t newblock;
225 - unsigned icount = dx_get_count(entries);
226 - int levels = frame - frames;
227 + int levels = frame - frames + 1;
228 + unsigned int icount;
230 struct dx_entry *entries2;
231 struct dx_node *node2;
232 struct buffer_head *bh2;
234 - if (levels && (dx_get_count(frames->entries) ==
235 - dx_get_limit(frames->entries))) {
236 - ext4_warning_inode(dir, "Directory index full!");
237 + while (frame > frames) {
238 + if (dx_get_count((frame - 1)->entries) <
239 + dx_get_limit((frame - 1)->entries)) {
243 + frame--; /* split higher index block */
245 + entries = frame->entries;
248 + if (add_level && levels == ext4_dir_htree_level(sb)) {
249 + ext4_warning(sb, "Directory (ino: %lu) index full, "
250 + "reach max htree level :%d",
251 + dir->i_ino, levels);
252 + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
253 + ext4_warning(sb, "Large directory feature is "
254 + "not enabled on this "
260 + icount = dx_get_count(entries);
261 bh2 = ext4_append(handle, dir, &newblock);
264 @@ -2187,7 +2224,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
265 err = ext4_journal_get_write_access(handle, frame->bh);
270 unsigned icount1 = icount/2, icount2 = icount - icount1;
271 unsigned hash2 = dx_get_hash(entries + icount1);
272 dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
273 @@ -2195,7 +2232,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
275 BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
276 err = ext4_journal_get_write_access(handle,
282 @@ -2211,17 +2248,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
283 frame->entries = entries = entries2;
284 swap(frame->bh, bh2);
286 - dx_insert_block(frames + 0, hash2, newblock);
287 - dxtrace(dx_show_index("node", frames[1].entries));
288 + dx_insert_block((frame - 1), hash2, newblock);
289 + dxtrace(dx_show_index("node", frame->entries));
290 dxtrace(dx_show_index("node",
291 ((struct dx_node *) bh2->b_data)->entries));
292 err = ext4_handle_dirty_dx_node(handle, dir, bh2);
296 + err = ext4_handle_dirty_dx_node(handle, dir,
299 + goto journal_error;
301 + err = ext4_handle_dirty_dx_node(handle, dir,
303 + goto journal_error;
306 - dxtrace(printk(KERN_DEBUG
307 - "Creating second level index...\n"));
308 + struct dx_root *dxroot;
309 memcpy((char *) entries2, (char *) entries,
310 icount * sizeof(struct dx_entry));
311 dx_set_limit(entries2, dx_node_limit(dir));
312 @@ -2229,22 +2274,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
314 dx_set_count(entries, 1);
315 dx_set_block(entries + 0, newblock);
316 - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
318 - /* Add new access path frame */
319 - frame = frames + 1;
320 - frame->at = at = at - entries + entries2;
321 - frame->entries = entries = entries2;
323 - err = ext4_journal_get_write_access(handle,
325 + dxroot = (struct dx_root *)frames[0].bh->b_data;
326 + dxroot->info.indirect_levels += 1;
327 + dxtrace(printk(KERN_DEBUG
328 + "Creating %d level index...\n",
329 + info->indirect_levels));
330 + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
334 - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
336 - ext4_std_error(inode->i_sb, err);
338 + err = ext4_handle_dirty_dx_node(handle, dir, bh2);
341 + goto journal_error;
344 de = do_split(handle, dir, &bh, frame, &fname->hinfo);
345 @@ -2256,10 +2297,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
349 - ext4_std_error(dir->i_sb, err);
350 + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
354 + /* If @restart is set, the htree path has changed, so we need to
355 + * repeat dx_probe() to find a valid htree path
357 + if (restart && err == 0)
362 @@ -2296,7 +2342,7 @@ int ext4_generic_delete_entry(handle_t *handle,
367 + inode_inc_iversion(dir);
370 i += ext4_rec_len_from_disk(de->rec_len, blocksize);