1 vfs: add support for a lazytime mount option
3 Add a new mount option which enables a new "lazytime" mode. This mode
4 causes atime, mtime, and ctime updates to only be made to the
5 in-memory version of the inode. The on-disk times will only get
6 updated when (a) the inode needs to be updated for some non-time
7 related change, (b) userspace calls fsync(), syncfs() or sync(), or
8 (c) just before an undeleted inode is evicted from memory.
10 This is OK according to POSIX because there are no guarantees after a
11 crash unless userspace explicitly requests it via an fsync(2) call.
13 For workloads which feature a large number of random writes to a
14 preallocated file, the lazytime mount option significantly reduces
15 writes to the inode table. The repeated 4k writes to a single block
16 will result in undesirable stress on flash devices and SMR disk
17 drives. Even on conventional HDD's, the repeated writes to the inode
18 table block will trigger Adjacent Track Interference (ATI) remediation
19 latencies, which very negatively impact 99.9 percentile latencies ---
20 which is a very big deal for web serving tiers (for example).
22 Google-Bug-Id: 18297052
24 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
26 fs/fs-writeback.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------
27 fs/inode.c | 43 +++++++++++++++++++++++++++++++++++++------
29 fs/logfs/readwrite.c | 2 +-
32 fs/proc_namespace.c | 1 +
33 fs/sync.c | 8 ++++++++
34 fs/ufs/truncate.c | 2 +-
35 include/linux/backing-dev.h | 1 +
36 include/linux/fs.h | 11 +++++++++--
37 include/uapi/linux/fs.h | 1 +
38 mm/backing-dev.c | 9 +++++++--
39 13 files changed, 118 insertions(+), 21 deletions(-)
41 diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
42 index ef9bef1..ef8c5d8 100644
43 --- a/fs/fs-writeback.c
44 +++ b/fs/fs-writeback.c
45 @@ -397,7 +397,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
46 * shot. If still dirty, it will be redirty_tail()'ed below. Update
47 * the dirty time to prevent enqueue and sync it again.
49 - if ((inode->i_state & I_DIRTY) &&
50 + if ((inode->i_state & I_DIRTY_WB) &&
51 (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
52 inode->dirtied_when = jiffies;
54 @@ -428,13 +428,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
56 redirty_tail(inode, wb);
58 - } else if (inode->i_state & I_DIRTY) {
59 + } else if (inode->i_state & I_DIRTY_WB) {
61 * Filesystems can dirty the inode during writeback operations,
62 * such as delayed allocation during submission or metadata
63 * updates after data IO completion.
65 redirty_tail(inode, wb);
66 + } else if (inode->i_state & I_DIRTY_TIME) {
67 + list_move(&inode->i_wb_list, &wb->b_dirty_time);
69 /* The inode is clean. Remove from writeback lists. */
70 list_del_init(&inode->i_wb_list);
71 @@ -482,11 +484,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
72 /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
73 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
74 inode->i_state &= ~I_DIRTY_PAGES;
75 - dirty = inode->i_state & I_DIRTY;
76 - inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
77 + dirty = inode->i_state & I_DIRTY_INODE;
78 + inode->i_state &= ~I_DIRTY_INODE;
79 spin_unlock(&inode->i_lock);
80 /* Don't write the inode if only I_DIRTY_PAGES was set */
81 - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
83 int err = write_inode(inode, wbc);
86 @@ -1162,7 +1164,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
88 spin_lock(&inode->i_lock);
89 if ((inode->i_state & flags) != flags) {
90 - const int was_dirty = inode->i_state & I_DIRTY;
91 + const int was_dirty = inode->i_state & I_DIRTY_WB;
93 inode->i_state |= flags;
95 @@ -1224,6 +1226,24 @@ out_unlock_inode:
97 EXPORT_SYMBOL(__mark_inode_dirty);
99 +void inode_requeue_dirtytime(struct inode *inode)
101 + struct backing_dev_info *bdi = inode_to_bdi(inode);
103 + spin_lock(&bdi->wb.list_lock);
104 + spin_lock(&inode->i_lock);
105 + if ((inode->i_state & I_DIRTY_WB) == 0) {
106 + if (inode->i_state & I_DIRTY_TIME)
107 + list_move(&inode->i_wb_list, &bdi->wb.b_dirty_time);
109 + list_del_init(&inode->i_wb_list);
111 + spin_unlock(&inode->i_lock);
112 + spin_unlock(&bdi->wb.list_lock);
115 +EXPORT_SYMBOL(inode_requeue_dirtytime);
117 static void wait_sb_inodes(struct super_block *sb)
119 struct inode *inode, *old_inode = NULL;
120 @@ -1277,6 +1297,28 @@ static void wait_sb_inodes(struct super_block *sb)
125 + * Take all of the inodes on the dirty_time list, and mark them as
126 + * dirty, so they will be written out.
128 +static void flush_sb_dirty_time(struct super_block *sb)
130 + struct bdi_writeback *wb = &sb->s_bdi->wb;
133 + spin_lock(&wb->list_lock);
134 + list_cut_position(&tmp, &wb->b_dirty_time, wb->b_dirty_time.prev);
135 + while (!list_empty(&tmp)) {
136 + struct inode *inode = wb_inode(tmp.prev);
138 + list_del_init(&inode->i_wb_list);
139 + spin_unlock(&wb->list_lock);
140 + mark_inode_dirty_sync(inode);
141 + spin_lock(&wb->list_lock);
143 + spin_unlock(&wb->list_lock);
147 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
148 * @sb: the superblock
149 @@ -1388,6 +1430,7 @@ void sync_inodes_sb(struct super_block *sb)
151 WARN_ON(!rwsem_is_locked(&sb->s_umount));
153 + flush_sb_dirty_time(sb);
154 bdi_queue_work(sb->s_bdi, &work);
155 wait_for_completion(&done);
157 diff --git a/fs/inode.c b/fs/inode.c
158 index 94bc908..009d9c5 100644
162 * inode_sb_list_lock protects:
163 * sb->s_inodes, inode->i_sb_list
164 * bdi->wb.list_lock protects:
165 - * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
166 + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
167 * inode_hash_lock protects:
168 * inode_hashtable, inode->i_hash
170 @@ -1430,11 +1430,22 @@ static void iput_final(struct inode *inode)
172 void iput(struct inode *inode)
175 - BUG_ON(inode->i_state & I_CLEAR);
177 - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
181 + BUG_ON(inode->i_state & I_CLEAR);
183 + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
184 + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
185 + atomic_inc(&inode->i_count);
186 + inode->i_state &= ~I_DIRTY_TIME;
187 + spin_unlock(&inode->i_lock);
188 + if (inode->i_op->write_time)
189 + inode->i_op->write_time(inode);
190 + else if (inode->i_sb->s_op->write_inode)
191 + mark_inode_dirty_sync(inode);
198 @@ -1516,6 +1527,26 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
199 inode->i_mtime = *time;
200 if (inode->i_op->update_time)
201 inode->i_op->update_time(inode);
202 + if ((inode->i_sb->s_flags & MS_LAZYTIME) &&
203 + !(flags & S_VERSION) &&
204 + !(inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))) {
205 + if (inode->i_state & I_DIRTY_TIME)
207 + spin_lock(&inode->i_lock);
208 + if (inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
209 + spin_unlock(&inode->i_lock);
212 + if (inode->i_state & I_DIRTY_TIME) {
213 + spin_unlock(&inode->i_lock);
216 + inode->i_state |= I_DIRTY_TIME;
217 + spin_unlock(&inode->i_lock);
218 + inode_requeue_dirtytime(inode);
222 if (inode->i_op->write_time)
223 return inode->i_op->write_time(inode);
224 mark_inode_dirty_sync(inode);
225 diff --git a/fs/libfs.c b/fs/libfs.c
226 index 171d284..b9923b2 100644
229 @@ -1066,7 +1066,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
230 * list because mark_inode_dirty() will think
231 * that it already _is_ on the dirty list.
233 - inode->i_state = I_DIRTY;
234 + inode->i_state = I_DIRTY_WB;
235 inode->i_mode = S_IRUSR | S_IWUSR;
236 inode->i_uid = current_fsuid();
237 inode->i_gid = current_fsgid();
238 diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
239 index 380d86e..5521842 100644
240 --- a/fs/logfs/readwrite.c
241 +++ b/fs/logfs/readwrite.c
242 @@ -2187,7 +2187,7 @@ void logfs_evict_inode(struct inode *inode)
243 * aliases, which are moved back. No write to the medium happens.
245 /* Only deleted files may be dirty at this point */
246 - BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
247 + BUG_ON(inode->i_state & I_DIRTY_WB && inode->i_nlink);
250 if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
251 diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
252 index 989129e..818c6fa 100644
255 @@ -915,7 +915,7 @@ static int wait_for_concurrent_writes(struct file *file)
256 dprintk("nfsd: write resume %d\n", task_pid_nr(current));
259 - if (inode->i_state & I_DIRTY) {
260 + if (inode->i_state & I_DIRTY_WB) {
261 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
262 err = vfs_fsync(file, 0);
264 diff --git a/fs/pipe.c b/fs/pipe.c
265 index 21981e5..fc9b923 100644
268 @@ -660,7 +660,7 @@ static struct inode * get_pipe_inode(void)
269 * list because "mark_inode_dirty()" will think
270 * that it already _is_ on the dirty list.
272 - inode->i_state = I_DIRTY;
273 + inode->i_state = I_DIRTY_WB;
274 inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
275 inode->i_uid = current_fsuid();
276 inode->i_gid = current_fsgid();
277 diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
278 index 73ca174..f98234a 100644
279 --- a/fs/proc_namespace.c
280 +++ b/fs/proc_namespace.c
281 @@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
282 { MS_SYNCHRONOUS, ",sync" },
283 { MS_DIRSYNC, ",dirsync" },
284 { MS_MANDLOCK, ",mand" },
285 + { MS_LAZYTIME, ",lazytime" },
288 const struct proc_fs_info *fs_infop;
289 diff --git a/fs/sync.c b/fs/sync.c
290 index bdc729d..6ac7bf0 100644
293 @@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
295 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
297 + struct inode *inode = file->f_mapping->host;
299 if (!file->f_op->fsync)
301 + if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
302 + spin_lock(&inode->i_lock);
303 + inode->i_state &= ~I_DIRTY_TIME;
304 + spin_unlock(&inode->i_lock);
305 + mark_inode_dirty_sync(inode);
307 return file->f_op->fsync(file, start, end, datasync);
309 EXPORT_SYMBOL(vfs_fsync_range);
310 diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
311 index f04f89f..1d00a09 100644
312 --- a/fs/ufs/truncate.c
313 +++ b/fs/ufs/truncate.c
314 @@ -477,7 +477,7 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
315 retry |= ufs_trunc_tindirect (inode);
318 - if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
319 + if (IS_SYNC(inode) && (inode->i_state & I_DIRTY_WB))
320 ufs_sync_inode (inode);
323 diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
324 index 5da6012..4cdf733 100644
325 --- a/include/linux/backing-dev.h
326 +++ b/include/linux/backing-dev.h
327 @@ -55,6 +55,7 @@ struct bdi_writeback {
328 struct list_head b_dirty; /* dirty inodes */
329 struct list_head b_io; /* parked for writeback */
330 struct list_head b_more_io; /* parked for more writeback */
331 + struct list_head b_dirty_time; /* time stamps are dirty */
332 spinlock_t list_lock; /* protects the b_* lists */
335 diff --git a/include/linux/fs.h b/include/linux/fs.h
336 index befd5d2..6a6f56b 100644
337 --- a/include/linux/fs.h
338 +++ b/include/linux/fs.h
339 @@ -1722,19 +1722,26 @@ struct super_operations {
340 #define __I_DIO_WAKEUP 9
341 #define I_DIO_WAKEUP (1 << I_DIO_WAKEUP)
342 #define I_LINKABLE (1 << 10)
343 +#define I_DIRTY_TIME (1 << 11)
345 -#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
346 +/* Inode should be on the b_dirty/b_io/b_more_io lists */
347 +#define I_DIRTY_WB (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
348 +/* Inode should be on the b_dirty/b_io/b_more_io/b_dirty_time lists */
349 +#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES | I_DIRTY_TIME)
350 +/* The inode itself is dirty */
351 +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)
353 extern void __mark_inode_dirty(struct inode *, int);
354 static inline void mark_inode_dirty(struct inode *inode)
356 - __mark_inode_dirty(inode, I_DIRTY);
357 + __mark_inode_dirty(inode, I_DIRTY_WB);
360 static inline void mark_inode_dirty_sync(struct inode *inode)
362 __mark_inode_dirty(inode, I_DIRTY_SYNC);
364 +extern void inode_requeue_dirtytime(struct inode *);
366 extern void inc_nlink(struct inode *inode);
367 extern void drop_nlink(struct inode *inode);
368 diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
369 index 3735fa0..cc9713a 100644
370 --- a/include/uapi/linux/fs.h
371 +++ b/include/uapi/linux/fs.h
372 @@ -90,6 +90,7 @@ struct inodes_stat_t {
373 #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
374 #define MS_I_VERSION (1<<23) /* Update inode I_version field */
375 #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
376 +#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
378 /* These sb flags are internal to the kernel */
379 #define MS_NOSEC (1<<28)
380 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
381 index 0ae0df5..14851fe 100644
382 --- a/mm/backing-dev.c
383 +++ b/mm/backing-dev.c
384 @@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
385 unsigned long background_thresh;
386 unsigned long dirty_thresh;
387 unsigned long bdi_thresh;
388 - unsigned long nr_dirty, nr_io, nr_more_io;
389 + unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
392 - nr_dirty = nr_io = nr_more_io = 0;
393 + nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
394 spin_lock(&wb->list_lock);
395 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
397 @@ -80,6 +80,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
399 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
401 + list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
403 spin_unlock(&wb->list_lock);
405 global_dirty_limits(&background_thresh, &dirty_thresh);
406 @@ -98,6 +100,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
410 + "b_dirty_time: %10lu\n"
413 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
414 @@ -111,6 +114,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
419 !list_empty(&bdi->bdi_list), bdi->state);
422 @@ -418,6 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
423 INIT_LIST_HEAD(&wb->b_dirty);
424 INIT_LIST_HEAD(&wb->b_io);
425 INIT_LIST_HEAD(&wb->b_more_io);
426 + INIT_LIST_HEAD(&wb->b_dirty_time);
427 spin_lock_init(&wb->list_lock);
428 INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);