1 ext4: add lazytime mount option
3 Add a new mount option which enables a new "lazytime" mode. This mode
4 causes atime, mtime, and ctime updates to only be made to the
5 in-memory version of the inode. The on-disk times will only get
6 updated (a) when the inode table block for the inode needs to be
7 updated for some non-time related change involving any inode in the
8 block, (b) when userspace calls fsync(), or (c) when the refcount on an
9 undeleted inode goes to zero (in most cases, when the last file
10 descriptor associated with the inode is closed).
12 This is legal according to POSIX because there are no guarantees after
13 a crash unless userspace explicitly requests them via an fsync(2) call.
14 So in fact, a better way of reducing the disk traffic resulting from
15 atime updates is to use lazytime instead of relatime or noatime. Enabling
16 lazytime and disabling the default relatime will result in fewer extra
17 disk writes, and has the benefit of being POSIX-compliant --- since
18 both noatime and relatime violate POSIX.
20 The lazytime mode reduces pressure on the journal spinlocks, since
21 time updates resulting from calls to file_update_time() are almost
22 always done using separate jbd2 handles. For allocating writes, the
23 inode will need to be updated anyway when i_blocks changes, and so the
24 mtime updates will be folded into the jbd2 handle in the ext4 write path.
26 In addition, for workloads featuring a large number of random writes to a
27 preallocated file, the lazytime mount option significantly reduces
28 writes to the inode table. The repeated 4k writes to a single block
29 will result in undesirable stress on flash devices and SMR disk
30 drives. Even on conventional HDDs, the repeated writes to the inode
31 table block will trigger Adjacent Track Interference (ATI) remediation
32 latencies, which very negatively impact 99.9 percentile latencies ---
33 which is a very big deal for web serving tiers (for example).
35 n.b.: because of the many wins of this mode, we may want to enable
36 lazytime updates by default in the future. If you know of use cases
37 where having inaccurate mtime values after a crash would be extremely
38 problematic, please let us know at linux-ext4@vger.kernel.org.
40 Google-Bug-Id: 18297052
42 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
44 fs/ext4/ext4.h | 3 +++
46 fs/ext4/fsync.c | 3 +++
47 fs/ext4/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
48 fs/ext4/namei.c | 2 ++
49 fs/ext4/super.c | 14 ++++++++++++++
50 fs/ext4/symlink.c | 2 ++
51 fs/inode.c | 36 ++++++++++++++++++++++++++++++++++
52 include/linux/fs.h | 2 ++
53 9 files changed, 124 insertions(+), 1 deletion(-)
55 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
56 index c55a1fa..494c504 100644
59 @@ -970,6 +970,7 @@ struct ext4_inode_info {
60 #define EXT4_MOUNT_ERRORS_MASK 0x00070
61 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
62 #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
63 +#define EXT4_MOUNT_LAZYTIME 0x00200 /* Update the time lazily */
64 #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
65 #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
66 #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
67 @@ -1407,6 +1408,7 @@ enum {
68 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
69 EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
70 EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
71 + EXT4_STATE_DIRTY_TIME, /* the time needs to be updated */
74 #define EXT4_INODE_BIT_FNS(name, field, offset) \
75 @@ -2114,6 +2116,7 @@ extern int ext4_write_inode(struct inode *, struct writeback_control *);
76 extern int ext4_setattr(struct dentry *, struct iattr *);
77 extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
79 +extern int ext4_update_time(struct inode *, struct timespec *, int);
80 extern void ext4_evict_inode(struct inode *);
81 extern void ext4_clear_inode(struct inode *);
82 extern int ext4_sync_inode(handle_t *, struct inode *);
83 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
84 index 8131be8..2cf6aaf 100644
87 @@ -603,6 +603,7 @@ const struct file_operations ext4_file_operations = {
88 const struct inode_operations ext4_file_inode_operations = {
89 .setattr = ext4_setattr,
90 .getattr = ext4_getattr,
91 + .update_time = ext4_update_time,
92 .setxattr = generic_setxattr,
93 .getxattr = generic_getxattr,
94 .listxattr = ext4_listxattr,
95 diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
96 index a8bc47f..ba05c83 100644
99 @@ -116,6 +116,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
100 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
104 + if (!datasync && ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME))
105 + ext4_dirty_inode(inode, 0);
107 * data=writeback,ordered:
108 * The caller's filemap_fdatawrite()/wait will sync the data.
109 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
110 index 3356ab5..1b5e4bd 100644
111 --- a/fs/ext4/inode.c
112 +++ b/fs/ext4/inode.c
113 @@ -4163,6 +4163,46 @@ static int ext4_inode_blocks_set(handle_t *handle,
117 + * Opportunistically update the other time fields for other inodes in
118 + * the same inode table block.
120 +static void ext4_update_other_inodes_time(struct super_block *sb,
121 + unsigned long orig_ino, char *buf)
123 + struct ext4_inode_info *ei;
124 + struct ext4_inode *raw_inode;
126 + struct inode *inode;
127 + int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
128 + int inode_size = EXT4_INODE_SIZE(sb);
130 + ino = orig_ino & ~(inodes_per_block - 1);
131 + for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
132 + if (ino == orig_ino)
134 + inode = find_active_inode_nowait(sb, ino);
136 + !ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME)) {
140 + raw_inode = (struct ext4_inode *) buf;
141 + ei = EXT4_I(inode);
143 + smp_mb__before_spinlock();
144 + spin_lock(&ei->i_raw_lock);
145 + ext4_clear_inode_state(inode, EXT4_STATE_DIRTY_TIME);
146 + EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
147 + EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
148 + EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
149 + EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
150 + ext4_inode_csum_set(inode, raw_inode, ei);
151 + spin_unlock(&ei->i_raw_lock);
157 * Post the struct inode info into an on-disk inode location in the
158 * buffer-cache. This gobbles the caller's reference to the
159 * buffer_head in the inode location struct.
160 @@ -4182,7 +4222,9 @@ static int ext4_do_update_inode(handle_t *handle,
164 + smp_mb__before_spinlock();
165 spin_lock(&ei->i_raw_lock);
166 + ext4_clear_inode_state(inode, EXT4_STATE_DIRTY_TIME);
168 /* For fields not tracked in the in-memory inode,
169 * initialise them to zero for new inodes. */
170 @@ -4273,8 +4315,8 @@ static int ext4_do_update_inode(handle_t *handle,
173 ext4_inode_csum_set(inode, raw_inode, ei);
175 spin_unlock(&ei->i_raw_lock);
176 + ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data);
178 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
179 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
180 @@ -4622,6 +4664,24 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
184 +int ext4_update_time(struct inode *inode, struct timespec *time, int flags)
186 + if (flags & S_ATIME)
187 + inode->i_atime = *time;
188 + if (flags & S_VERSION)
189 + inode_inc_iversion(inode);
190 + if (flags & S_CTIME)
191 + inode->i_ctime = *time;
192 + if (flags & S_MTIME)
193 + inode->i_mtime = *time;
194 + if (test_opt(inode->i_sb, LAZYTIME)) {
196 + ext4_set_inode_state(inode, EXT4_STATE_DIRTY_TIME);
198 + mark_inode_dirty_sync(inode);
202 static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
205 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
206 index 4262118..f782040 100644
207 --- a/fs/ext4/namei.c
208 +++ b/fs/ext4/namei.c
209 @@ -3532,6 +3532,7 @@ const struct inode_operations ext4_dir_inode_operations = {
210 .tmpfile = ext4_tmpfile,
211 .rename2 = ext4_rename2,
212 .setattr = ext4_setattr,
213 + .update_time = ext4_update_time,
214 .setxattr = generic_setxattr,
215 .getxattr = generic_getxattr,
216 .listxattr = ext4_listxattr,
217 @@ -3545,6 +3546,7 @@ const struct inode_operations ext4_special_inode_operations = {
218 .setattr = ext4_setattr,
219 .setxattr = generic_setxattr,
220 .getxattr = generic_getxattr,
221 + .update_time = ext4_update_time,
222 .listxattr = ext4_listxattr,
223 .removexattr = generic_removexattr,
224 .get_acl = ext4_get_acl,
225 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
226 index 2c9e686..16c9983 100644
227 --- a/fs/ext4/super.c
228 +++ b/fs/ext4/super.c
229 @@ -910,6 +910,14 @@ static int ext4_drop_inode(struct inode *inode)
230 int drop = generic_drop_inode(inode);
232 trace_ext4_drop_inode(inode, drop);
233 + if (!drop && ext4_test_inode_state(inode, EXT4_STATE_DIRTY_TIME)) {
234 + atomic_inc(&inode->i_count);
235 + spin_unlock(&inode->i_lock);
236 + ext4_dirty_inode(inode, 0);
237 + spin_lock(&inode->i_lock);
238 + if (atomic_dec_and_test(&inode->i_count))
239 + drop = generic_drop_inode(inode);
244 @@ -1142,6 +1150,7 @@ enum {
245 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
246 Opt_usrquota, Opt_grpquota, Opt_i_version,
247 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
248 + Opt_lazytime, Opt_nolazytime,
249 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
250 Opt_inode_readahead_blks, Opt_journal_ioprio,
251 Opt_dioread_nolock, Opt_dioread_lock,
252 @@ -1204,6 +1213,8 @@ static const match_table_t tokens = {
253 {Opt_i_version, "i_version"},
254 {Opt_stripe, "stripe=%u"},
255 {Opt_delalloc, "delalloc"},
256 + {Opt_lazytime, "lazytime"},
257 + {Opt_nolazytime, "nolazytime"},
258 {Opt_nodelalloc, "nodelalloc"},
259 {Opt_removed, "mblk_io_submit"},
260 {Opt_removed, "nomblk_io_submit"},
261 @@ -1361,6 +1372,8 @@ static const struct mount_opts {
262 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
263 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
264 MOPT_EXT4_ONLY | MOPT_CLEAR},
265 + {Opt_lazytime, EXT4_MOUNT_LAZYTIME, MOPT_SET},
266 + {Opt_nolazytime, EXT4_MOUNT_LAZYTIME, MOPT_CLEAR},
267 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
268 MOPT_EXT4_ONLY | MOPT_SET},
269 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
270 @@ -3514,6 +3527,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
272 /* Set defaults before we parse the mount options */
273 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
274 + set_opt(sb, LAZYTIME);
275 set_opt(sb, INIT_INODE_TABLE);
276 if (def_mount_opts & EXT4_DEFM_DEBUG)
278 diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
279 index ff37119..7c92b93 100644
280 --- a/fs/ext4/symlink.c
281 +++ b/fs/ext4/symlink.c
282 @@ -35,6 +35,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
283 .follow_link = page_follow_link_light,
284 .put_link = page_put_link,
285 .setattr = ext4_setattr,
286 + .update_time = ext4_update_time,
287 .setxattr = generic_setxattr,
288 .getxattr = generic_getxattr,
289 .listxattr = ext4_listxattr,
290 @@ -45,6 +46,7 @@ const struct inode_operations ext4_fast_symlink_inode_operations = {
291 .readlink = generic_readlink,
292 .follow_link = ext4_follow_link,
293 .setattr = ext4_setattr,
294 + .update_time = ext4_update_time,
295 .setxattr = generic_setxattr,
296 .getxattr = generic_getxattr,
297 .listxattr = ext4_listxattr,
298 diff --git a/fs/inode.c b/fs/inode.c
299 index 26753ba..cde073a 100644
302 @@ -1280,6 +1280,42 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
304 EXPORT_SYMBOL(ilookup);
307 + * find_active_inode_nowait - find an active inode in the inode cache
308 + * @sb: super block of file system to search
309 + * @ino: inode number to search for
311 + * Search for an active inode @ino in the inode cache, and if the
312 + * inode is in the cache, the inode is returned with an incremented
313 + * reference count. If the inode is being freed or is newly
314 + * initialized, return nothing instead of trying to wait for the inode
315 + * initialization or destruction to be complete.
317 +struct inode *find_active_inode_nowait(struct super_block *sb,
320 + struct hlist_head *head = inode_hashtable + hash(sb, ino);
321 + struct inode *inode, *ret_inode = NULL;
323 + spin_lock(&inode_hash_lock);
324 + hlist_for_each_entry(inode, head, i_hash) {
325 + if ((inode->i_ino != ino) ||
326 + (inode->i_sb != sb))
328 + spin_lock(&inode->i_lock);
329 + if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) == 0) {
333 + spin_unlock(&inode->i_lock);
337 + spin_unlock(&inode_hash_lock);
340 +EXPORT_SYMBOL(find_active_inode_nowait);
342 int insert_inode_locked(struct inode *inode)
344 struct super_block *sb = inode->i_sb;
345 diff --git a/include/linux/fs.h b/include/linux/fs.h
346 index 9ab779e..b5e6b6b 100644
347 --- a/include/linux/fs.h
348 +++ b/include/linux/fs.h
349 @@ -2410,6 +2410,8 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
351 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
352 extern struct inode * iget_locked(struct super_block *, unsigned long);
353 +extern struct inode *find_active_inode_nowait(struct super_block *,
355 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
356 extern int insert_inode_locked(struct inode *);
357 #ifdef CONFIG_DEBUG_LOCK_ALLOC