1 ext4: Use readahead when reading an inode from the inode table
3 With modern hard drives, reading 64k takes roughly the same time as
4 reading a 4k block. So request readahead for adjacent inode table
5 blocks to reduce the time it takes when iterating over directories
6 (especially when doing this in htree sort order) in a cold cache case.
7 With this patch, the time it takes to run "git status" on a kernel
8 tree after flushing the caches via "echo 3 > /proc/sys/vm/drop_caches"
11 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
12 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
13 index 163c445..fc7ce2e 100644
16 @@ -790,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
17 #define EXT4_DEF_RESUID 0
18 #define EXT4_DEF_RESGID 0
20 +#define EXT4_DEF_INODE_READAHEAD_BITS 5
23 * Default mount options
25 diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
26 index f92af01..04e1fd2 100644
27 --- a/fs/ext4/ext4_sb.h
28 +++ b/fs/ext4/ext4_sb.h
29 @@ -52,6 +52,7 @@ struct ext4_sb_info {
30 int s_desc_per_block_bits;
33 + unsigned int s_inode_readahead_bits;
34 spinlock_t s_next_gen_lock;
35 u32 s_next_generation;
37 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
38 index eed1265..5c19604 100644
41 @@ -3833,41 +3833,6 @@ out_stop:
42 ext4_journal_stop(handle);
45 -static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
46 - unsigned long ino, struct ext4_iloc *iloc)
48 - ext4_group_t block_group;
49 - unsigned long offset;
51 - struct ext4_group_desc *gdp;
53 - if (!ext4_valid_inum(sb, ino)) {
55 - * This error is already checked for in namei.c unless we are
56 - * looking at an NFS filehandle, in which case no error
62 - block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
63 - gdp = ext4_get_group_desc(sb, block_group, NULL);
68 - * Figure out the offset within the block group inode table
70 - offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
71 - EXT4_INODE_SIZE(sb);
72 - block = ext4_inode_table(sb, gdp) +
73 - (offset >> EXT4_BLOCK_SIZE_BITS(sb));
75 - iloc->block_group = block_group;
76 - iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
81 * ext4_get_inode_loc returns with an extra refcount against the inode's
82 * underlying buffer_head on success. If 'in_mem' is true, we have all
83 @@ -3877,19 +3842,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
84 static int __ext4_get_inode_loc(struct inode *inode,
85 struct ext4_iloc *iloc, int in_mem)
88 - struct buffer_head *bh;
89 + struct ext4_group_desc *gdp;
90 + struct buffer_head *bh;
91 + struct super_block *sb = inode->i_sb;
93 + int inodes_per_block, inode_offset;
96 + if (!ext4_valid_inum(sb, inode->i_ino))
99 - block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
101 + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
102 + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
106 - bh = sb_getblk(inode->i_sb, block);
108 + * Figure out the offset within the block group inode table
110 + inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
111 + inode_offset = ((inode->i_ino - 1) %
112 + EXT4_INODES_PER_GROUP(sb));
113 + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
114 + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
116 + bh = sb_getblk(sb, block);
118 - ext4_error (inode->i_sb, "ext4_get_inode_loc",
119 - "unable to read inode block - "
120 - "inode=%lu, block=%llu",
121 - inode->i_ino, block);
122 + ext4_error(sb, "ext4_get_inode_loc", "unable to read "
123 + "inode block - inode=%lu, block=%llu",
124 + inode->i_ino, block);
127 if (!buffer_uptodate(bh)) {
128 @@ -3917,28 +3898,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
131 struct buffer_head *bitmap_bh;
132 - struct ext4_group_desc *desc;
133 - int inodes_per_buffer;
134 - int inode_offset, i;
135 - ext4_group_t block_group;
138 - block_group = (inode->i_ino - 1) /
139 - EXT4_INODES_PER_GROUP(inode->i_sb);
140 - inodes_per_buffer = bh->b_size /
141 - EXT4_INODE_SIZE(inode->i_sb);
142 - inode_offset = ((inode->i_ino - 1) %
143 - EXT4_INODES_PER_GROUP(inode->i_sb));
144 - start = inode_offset & ~(inodes_per_buffer - 1);
147 - /* Is the inode bitmap in cache? */
148 - desc = ext4_get_group_desc(inode->i_sb,
149 - block_group, NULL);
152 + start = inode_offset & ~(inodes_per_block - 1);
154 - bitmap_bh = sb_getblk(inode->i_sb,
155 - ext4_inode_bitmap(inode->i_sb, desc));
156 + /* Is the inode bitmap in cache? */
157 + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
161 @@ -3951,14 +3916,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
165 - for (i = start; i < start + inodes_per_buffer; i++) {
166 + for (i = start; i < start + inodes_per_block; i++) {
167 if (i == inode_offset)
169 if (ext4_test_bit(i, bitmap_bh->b_data))
173 - if (i == start + inodes_per_buffer) {
174 + if (i == start + inodes_per_block) {
175 /* all other inodes are free, so skip I/O */
176 memset(bh->b_data, 0, bh->b_size);
177 set_buffer_uptodate(bh);
178 @@ -3969,6 +3934,31 @@ static int __ext4_get_inode_loc(struct inode *inode,
182 + * If we need to do any I/O, try to readahead up to 2^N blocks
183 + * from the inode table, N = s_inode_readahead_bits (default 5).
185 + if (EXT4_SB(sb)->s_inode_readahead_bits) {
186 + ext4_fsblk_t b, end, table;
187 + int ra = 1 << EXT4_SB(sb)->s_inode_readahead_bits;
190 + table = ext4_inode_table(sb, gdp);
191 + b = block & ~(ra-1);
195 + num = EXT4_INODES_PER_GROUP(sb);
196 + if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
197 + EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
198 + num -= le16_to_cpu(gdp->bg_itable_unused);
199 + table += num / inodes_per_block;
203 + sb_breadahead(sb, b++);
207 * There are other valid inodes in the buffer, this inode
208 * has in-inode xattrs, or we don't have this inode in memory.
209 * Read the block from disk.
210 @@ -3978,10 +3968,9 @@ make_io:
211 submit_bh(READ_META, bh);
213 if (!buffer_uptodate(bh)) {
214 - ext4_error(inode->i_sb, "ext4_get_inode_loc",
215 - "unable to read inode block - "
216 - "inode=%lu, block=%llu",
217 - inode->i_ino, block);
218 + ext4_error(sb, "ext4_get_inode_loc",
219 + "unable to read inode block - inode=%lu, "
220 + "block=%llu", inode->i_ino, block);
224 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
225 index 1515006..00c8d97 100644
226 --- a/fs/ext4/super.c
227 +++ b/fs/ext4/super.c
228 @@ -514,8 +514,10 @@ static void ext4_put_super(struct super_block *sb)
229 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
230 ext4_commit_super(sb, es, 1);
234 + remove_proc_entry("inode_readahead_bits", sbi->s_proc);
235 remove_proc_entry(sb->s_id, ext4_proc_root);
238 for (i = 0; i < sbi->s_gdb_count; i++)
239 brelse(sbi->s_group_desc[i]);
240 @@ -778,6 +780,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
241 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
242 seq_puts(seq, ",data=writeback");
244 + if (sbi->s_inode_readahead_bits != EXT4_DEF_INODE_READAHEAD_BITS)
245 + seq_printf(seq, ",inode_readahead_bits=%d",
246 + sbi->s_inode_readahead_bits);
248 ext4_show_quota_options(seq, sb);
251 @@ -912,6 +918,7 @@ enum {
252 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
253 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
254 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
255 + Opt_inode_readahead_bits
258 static match_table_t tokens = {
259 @@ -972,6 +979,7 @@ static match_table_t tokens = {
260 {Opt_resize, "resize"},
261 {Opt_delalloc, "delalloc"},
262 {Opt_nodelalloc, "nodelalloc"},
263 + {Opt_inode_readahead_bits, "inode_readahead_bits=%u"},
267 @@ -1380,6 +1388,13 @@ set_qf_format:
269 set_opt(sbi->s_mount_opt, DELALLOC);
271 + case Opt_inode_readahead_bits:
272 + if (match_int(&args[0], &option))
274 + if (option < 0 || option > 31)
276 + sbi->s_inode_readahead_bits = option;
280 "EXT4-fs: Unrecognized mount option \"%s\" "
281 @@ -1937,6 +1952,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
282 sbi->s_mount_opt = 0;
283 sbi->s_resuid = EXT4_DEF_RESUID;
284 sbi->s_resgid = EXT4_DEF_RESGID;
285 + sbi->s_inode_readahead_bits = EXT4_DEF_INODE_READAHEAD_BITS;
286 sbi->s_sb_block = sb_block;
289 @@ -2233,6 +2249,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
291 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
294 + proc_create_data("inode_readahead_bits", 0644, sbi->s_proc,
295 + &ext4_ui_proc_fops,
296 + &sbi->s_inode_readahead_bits);
298 bgl_lock_init(&sbi->s_blockgroup_lock);
300 for (i = 0; i < db_count; i++) {
301 @@ -2512,8 +2533,10 @@ failed_mount2:
302 brelse(sbi->s_group_desc[i]);
303 kfree(sbi->s_group_desc);
307 + remove_proc_entry("inode_readahead_bits", sbi->s_proc);
308 remove_proc_entry(sb->s_id, ext4_proc_root);
311 for (i = 0; i < MAXQUOTAS; i++)
312 kfree(sbi->s_qf_names[i]);