Add jbd2-abort-instead-of-waiting-for-nonexistent-transaction
[ext4-patch-queue.git] / inode-readahead
blobcd1a10a5a28c245b7b7d549de7281c66732ecba2
1 ext4: Use readahead when reading an inode from the inode table
3 With modern hard drives, reading 64k takes roughly the same time as
4 reading a 4k block.  So request readahead for adjacent inode table
5 blocks to reduce the time it takes when iterating over directories
6 (especially when doing this in htree sort order) in a cold cache case.
7 With this patch, the time it takes to run "git status" on a kernel
8 tree after flushing the caches via "echo 3 > /proc/sys/vm/drop_caches"
9 is reduced by 21%.
11 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
12 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
13 index 163c445..922d187 100644
14 --- a/fs/ext4/ext4.h
15 +++ b/fs/ext4/ext4.h
16 @@ -790,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
17  #define        EXT4_DEF_RESUID         0
18  #define        EXT4_DEF_RESGID         0
20 +#define EXT4_DEF_INODE_READAHEAD_BLKS  32
22  /*
23   * Default mount options
24   */
25 diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
26 index f92af01..94e0757 100644
27 --- a/fs/ext4/ext4_sb.h
28 +++ b/fs/ext4/ext4_sb.h
29 @@ -52,6 +52,7 @@ struct ext4_sb_info {
30         int s_desc_per_block_bits;
31         int s_inode_size;
32         int s_first_ino;
33 +       unsigned int s_inode_readahead_blks;
34         spinlock_t s_next_gen_lock;
35         u32 s_next_generation;
36         u32 s_hash_seed[4];
37 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
38 index eed1265..5bd700f 100644
39 --- a/fs/ext4/inode.c
40 +++ b/fs/ext4/inode.c
41 @@ -3833,41 +3833,6 @@ out_stop:
42         ext4_journal_stop(handle);
43  }
45 -static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
46 -               unsigned long ino, struct ext4_iloc *iloc)
48 -       ext4_group_t block_group;
49 -       unsigned long offset;
50 -       ext4_fsblk_t block;
51 -       struct ext4_group_desc *gdp;
53 -       if (!ext4_valid_inum(sb, ino)) {
54 -               /*
55 -                * This error is already checked for in namei.c unless we are
56 -                * looking at an NFS filehandle, in which case no error
57 -                * report is needed
58 -                */
59 -               return 0;
60 -       }
62 -       block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
63 -       gdp = ext4_get_group_desc(sb, block_group, NULL);
64 -       if (!gdp)
65 -               return 0;
67 -       /*
68 -        * Figure out the offset within the block group inode table
69 -        */
70 -       offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
71 -               EXT4_INODE_SIZE(sb);
72 -       block = ext4_inode_table(sb, gdp) +
73 -               (offset >> EXT4_BLOCK_SIZE_BITS(sb));
75 -       iloc->block_group = block_group;
76 -       iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
77 -       return block;
80  /*
81   * ext4_get_inode_loc returns with an extra refcount against the inode's
82   * underlying buffer_head on success. If 'in_mem' is true, we have all
83 @@ -3877,19 +3842,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
84  static int __ext4_get_inode_loc(struct inode *inode,
85                                 struct ext4_iloc *iloc, int in_mem)
86  {
87 -       ext4_fsblk_t block;
88 -       struct buffer_head *bh;
89 +       struct ext4_group_desc  *gdp;
90 +       struct buffer_head      *bh;
91 +       struct super_block      *sb = inode->i_sb;
92 +       ext4_fsblk_t            block;
93 +       int                     inodes_per_block, inode_offset;
95 +       iloc->bh = 0;
96 +       if (!ext4_valid_inum(sb, inode->i_ino))
97 +               return -EIO;
99 -       block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
100 -       if (!block)
101 +       iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
102 +       gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
103 +       if (!gdp)
104                 return -EIO;
106 -       bh = sb_getblk(inode->i_sb, block);
107 +       /*
108 +        * Figure out the offset within the block group inode table
109 +        */
110 +       inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
111 +       inode_offset = ((inode->i_ino - 1) %
112 +                       EXT4_INODES_PER_GROUP(sb));
113 +       block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
114 +       iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
116 +       bh = sb_getblk(sb, block);
117         if (!bh) {
118 -               ext4_error (inode->i_sb, "ext4_get_inode_loc",
119 -                               "unable to read inode block - "
120 -                               "inode=%lu, block=%llu",
121 -                                inode->i_ino, block);
122 +               ext4_error(sb, "ext4_get_inode_loc", "unable to read "
123 +                          "inode block - inode=%lu, block=%llu",
124 +                          inode->i_ino, block);
125                 return -EIO;
126         }
127         if (!buffer_uptodate(bh)) {
128 @@ -3917,28 +3898,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
129                  */
130                 if (in_mem) {
131                         struct buffer_head *bitmap_bh;
132 -                       struct ext4_group_desc *desc;
133 -                       int inodes_per_buffer;
134 -                       int inode_offset, i;
135 -                       ext4_group_t block_group;
136 -                       int start;
138 -                       block_group = (inode->i_ino - 1) /
139 -                                       EXT4_INODES_PER_GROUP(inode->i_sb);
140 -                       inodes_per_buffer = bh->b_size /
141 -                               EXT4_INODE_SIZE(inode->i_sb);
142 -                       inode_offset = ((inode->i_ino - 1) %
143 -                                       EXT4_INODES_PER_GROUP(inode->i_sb));
144 -                       start = inode_offset & ~(inodes_per_buffer - 1);
145 +                       int i, start;
147 -                       /* Is the inode bitmap in cache? */
148 -                       desc = ext4_get_group_desc(inode->i_sb,
149 -                                               block_group, NULL);
150 -                       if (!desc)
151 -                               goto make_io;
152 +                       start = inode_offset & ~(inodes_per_block - 1);
154 -                       bitmap_bh = sb_getblk(inode->i_sb,
155 -                               ext4_inode_bitmap(inode->i_sb, desc));
156 +                       /* Is the inode bitmap in cache? */
157 +                       bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
158                         if (!bitmap_bh)
159                                 goto make_io;
161 @@ -3951,14 +3916,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
162                                 brelse(bitmap_bh);
163                                 goto make_io;
164                         }
165 -                       for (i = start; i < start + inodes_per_buffer; i++) {
166 +                       for (i = start; i < start + inodes_per_block; i++) {
167                                 if (i == inode_offset)
168                                         continue;
169                                 if (ext4_test_bit(i, bitmap_bh->b_data))
170                                         break;
171                         }
172                         brelse(bitmap_bh);
173 -                       if (i == start + inodes_per_buffer) {
174 +                       if (i == start + inodes_per_block) {
175                                 /* all other inodes are free, so skip I/O */
176                                 memset(bh->b_data, 0, bh->b_size);
177                                 set_buffer_uptodate(bh);
178 @@ -3969,6 +3934,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
180  make_io:
181                 /*
182 +                * If we need to do any I/O, try to readahead up to
183 +                * s_inode_readahead_blks blocks from the inode table.
184 +                */
185 +               if (EXT4_SB(sb)->s_inode_readahead_blks) {
186 +                       ext4_fsblk_t b, end, table;
187 +                       unsigned num;
189 +                       table = ext4_inode_table(sb, gdp);
190 +                       /* Make sure s_inode_readahead_blks is a power of 2 */
191 +                       while (EXT4_SB(sb)->s_inode_readahead_blks &
192 +                              (EXT4_SB(sb)->s_inode_readahead_blks-1))
193 +                               EXT4_SB(sb)->s_inode_readahead_blks = 
194 +                                  (EXT4_SB(sb)->s_inode_readahead_blks &
195 +                                   (EXT4_SB(sb)->s_inode_readahead_blks-1));
196 +                       b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
197 +                       if (table > b)
198 +                               b = table;
199 +                       end = b + EXT4_SB(sb)->s_inode_readahead_blks;
200 +                       num = EXT4_INODES_PER_GROUP(sb);
201 +                       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
202 +                                      EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
203 +                               num -= le16_to_cpu(gdp->bg_itable_unused);
204 +                       table += num / inodes_per_block;
205 +                       if (end > table)
206 +                               end = table;
207 +                       while (b <= end)
208 +                               sb_breadahead(sb, b++);
209 +               }
211 +               /*
212                  * There are other valid inodes in the buffer, this inode
213                  * has in-inode xattrs, or we don't have this inode in memory.
214                  * Read the block from disk.
215 @@ -3978,10 +3973,9 @@ make_io:
216                 submit_bh(READ_META, bh);
217                 wait_on_buffer(bh);
218                 if (!buffer_uptodate(bh)) {
219 -                       ext4_error(inode->i_sb, "ext4_get_inode_loc",
220 -                                       "unable to read inode block - "
221 -                                       "inode=%lu, block=%llu",
222 -                                       inode->i_ino, block);
223 +                       ext4_error(sb, "ext4_get_inode_loc",
224 +                                  "unable to read inode block - inode=%lu, "
225 +                                  "block=%llu", inode->i_ino, block);
226                         brelse(bh);
227                         return -EIO;
228                 }
229 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
230 index 6dee26d..9094095 100644
231 --- a/fs/ext4/super.c
232 +++ b/fs/ext4/super.c
233 @@ -514,8 +514,10 @@ static void ext4_put_super(struct super_block *sb)
234                 BUFFER_TRACE(sbi->s_sbh, "marking dirty");
235                 ext4_commit_super(sb, es, 1);
236         }
237 -       if (sbi->s_proc)
238 +       if (sbi->s_proc) {
239 +               remove_proc_entry("inode_readahead_blks", sbi->s_proc);
240                 remove_proc_entry(sb->s_id, ext4_proc_root);
241 +       }
243         for (i = 0; i < sbi->s_gdb_count; i++)
244                 brelse(sbi->s_group_desc[i]);
245 @@ -778,6 +780,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
246         else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
247                 seq_puts(seq, ",data=writeback");
249 +       if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
250 +               seq_printf(seq, ",inode_readahead_blks=%u",
251 +                          sbi->s_inode_readahead_blks);
253         ext4_show_quota_options(seq, sb);
254         return 0;
256 @@ -912,6 +918,7 @@ enum {
257         Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
258         Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
259         Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
260 +       Opt_inode_readahead_blks
261  };
263  static match_table_t tokens = {
264 @@ -972,6 +979,7 @@ static match_table_t tokens = {
265         {Opt_resize, "resize"},
266         {Opt_delalloc, "delalloc"},
267         {Opt_nodelalloc, "nodelalloc"},
268 +       {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
269         {Opt_err, NULL},
270  };
272 @@ -1380,6 +1388,13 @@ set_qf_format:
273                 case Opt_delalloc:
274                         set_opt(sbi->s_mount_opt, DELALLOC);
275                         break;
276 +               case Opt_inode_readahead_blks:
277 +                       if (match_int(&args[0], &option))
278 +                               return 0;
279 +                       if (option < 0 || option > 31)
280 +                               return 0;
281 +                       sbi->s_inode_readahead_blks = option;
282 +                       break;
283                 default:
284                         printk(KERN_ERR
285                                "EXT4-fs: Unrecognized mount option \"%s\" "
286 @@ -1937,6 +1952,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
287         sbi->s_mount_opt = 0;
288         sbi->s_resuid = EXT4_DEF_RESUID;
289         sbi->s_resgid = EXT4_DEF_RESGID;
290 +       sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
291         sbi->s_sb_block = sb_block;
293         unlock_kernel();
294 @@ -2233,6 +2249,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
295         if (ext4_proc_root)
296                 sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
298 +       if (sbi->s_proc)
299 +               proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
300 +                                &ext4_ui_proc_fops,
301 +                                &sbi->s_inode_readahead_blks);
303         bgl_lock_init(&sbi->s_blockgroup_lock);
305         for (i = 0; i < db_count; i++) {
306 @@ -2512,8 +2533,10 @@ failed_mount2:
307                 brelse(sbi->s_group_desc[i]);
308         kfree(sbi->s_group_desc);
309  failed_mount:
310 -       if (sbi->s_proc)
311 +       if (sbi->s_proc) {
312 +               remove_proc_entry("inode_readahead_blks", sbi->s_proc);
313                 remove_proc_entry(sb->s_id, ext4_proc_root);
314 +       }
315  #ifdef CONFIG_QUOTA
316         for (i = 0; i < MAXQUOTAS; i++)
317                 kfree(sbi->s_qf_names[i]);