1 fs: make sure the timestamps for lazytime inodes eventually get written
3 Jan Kara pointed out that if there is an inode which is constantly
4 getting dirtied with I_DIRTY_PAGES, an inode with an updated timestamp
5 will never be written since inode->dirtied_when is constantly getting
6 updated. We fix this by adding an extra field to the inode,
7 dirtied_time_when, so inodes with a stale dirtytime can get detected
10 In addition, if we have a dirtytime inode caused by an atime update,
11 and there is no write activity on the file system, we need to have a
12 secondary system to make sure these inodes get written out. We do
13 this by setting up a second delayed work structure which wakes up the
14 CPU much more rarely compared to writeback_expire_centisecs.
16 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
17 Reviewed-by: Jan Kara <jack@suse.cz>
19 fs/fs-writeback.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------
20 include/linux/fs.h | 1 +
21 2 files changed, 73 insertions(+), 10 deletions(-)
23 diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
24 index e907052..2cfcd74 100644
25 --- a/fs/fs-writeback.c
26 +++ b/fs/fs-writeback.c
27 @@ -53,6 +53,18 @@ struct wb_writeback_work {
28 struct completion *done; /* set if the caller waits */
32 + * If an inode is constantly having its pages dirtied, but then the
33 + * updates stop dirtytime_expire_interval seconds in the past, it's
34 + * possible for the worst case time between when an inode has its
35 + * timestamps updated and when they finally get written out to be two
36 + * dirtytime_expire_intervals. We set the default to 12 hours (in
37 + * seconds), which means most of the time inodes will have their
38 + * timestamps written to disk after 12 hours, but in the worst case a
39 + * few inodes might not have their timestamps updated for 24 hours.
41 +unsigned int dirtytime_expire_interval = 12 * 60 * 60;
44 * writeback_in_progress - determine whether there is writeback in progress
45 * @bdi: the device's backing_dev_info structure.
46 @@ -275,8 +287,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
48 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
49 older_than_this = work->older_than_this;
50 - else if ((work->reason == WB_REASON_SYNC) == 0) {
51 - expire_time = jiffies - (HZ * 86400);
52 + else if (!work->for_sync) {
53 + expire_time = jiffies - (dirtytime_expire_interval * HZ);
54 older_than_this = &expire_time;
56 while (!list_empty(delaying_queue)) {
57 @@ -458,6 +470,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
59 redirty_tail(inode, wb);
60 } else if (inode->i_state & I_DIRTY_TIME) {
61 + inode->dirtied_when = jiffies;
62 list_move(&inode->i_wb_list, &wb->b_dirty_time);
64 /* The inode is clean. Remove from writeback lists. */
65 @@ -505,12 +518,17 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
66 spin_lock(&inode->i_lock);
68 dirty = inode->i_state & I_DIRTY;
69 - if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
70 - (inode->i_state & I_DIRTY_TIME)) ||
71 - (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
72 - dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
73 - trace_writeback_lazytime(inode);
75 + if (inode->i_state & I_DIRTY_TIME) {
76 + if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
77 + unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
78 + unlikely(time_after(jiffies,
79 + (inode->dirtied_time_when +
80 + dirtytime_expire_interval * HZ)))) {
81 + dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
82 + trace_writeback_lazytime(inode);
85 + inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
86 inode->i_state &= ~dirty;
89 @@ -1131,6 +1149,45 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
94 + * Wake up bdi's periodically to make sure dirtytime inodes gets
95 + * written back periodically. We deliberately do *not* check the
96 + * b_dirtytime list in wb_has_dirty_io(), since this would cause the
97 + * kernel to be constantly waking up once there are any dirtytime
98 + * inodes on the system. So instead we define a separate delayed work
99 + * function which gets called much more rarely. (By default, only
100 + * once every 12 hours.)
102 + * If there is any other write activity going on in the file system,
103 + * this function won't be necessary. But if the only thing that has
104 + * happened on the file system is a dirtytime inode caused by an atime
105 + * update, we need this infrastructure below to make sure that inode
106 + * eventually gets pushed out to disk.
108 +static void wakeup_dirtytime_writeback(struct work_struct *w);
109 +static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
111 +static void wakeup_dirtytime_writeback(struct work_struct *w)
113 + struct backing_dev_info *bdi;
116 + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
117 + if (list_empty(&bdi->wb.b_dirty_time))
119 + bdi_wakeup_thread(bdi);
122 + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
125 +static int __init start_dirtytime_writeback(void)
127 + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
130 +__initcall(start_dirtytime_writeback);
132 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
134 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
135 @@ -1269,8 +1326,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
138 inode->dirtied_when = jiffies;
139 - list_move(&inode->i_wb_list, dirtytime ?
140 - &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
142 + inode->dirtied_time_when = jiffies;
143 + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
144 + list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
146 + list_move(&inode->i_wb_list,
147 + &bdi->wb.b_dirty_time);
148 spin_unlock(&bdi->wb.list_lock);
149 trace_writeback_dirty_inode_enqueue(inode);
151 diff --git a/include/linux/fs.h b/include/linux/fs.h
152 index b4d71b5..f4131e8 100644
153 --- a/include/linux/fs.h
154 +++ b/include/linux/fs.h
155 @@ -604,6 +604,7 @@ struct inode {
156 struct mutex i_mutex;
158 unsigned long dirtied_when; /* jiffies of first dirtying */
159 + unsigned long dirtied_time_when;
161 struct hlist_node i_hash;
162 struct list_head i_wb_list; /* backing dev IO list */