1 ext4: fix races between changing inode journal mode and ext4_writepages
3 From: Daeho Jeong <daeho.jeong@samsung.com>
5 In ext4, there is a race condition between changing inode journal mode
6 and ext4_writepages(). While ext4_writepages() is executed on a
7 non-journaled mode inode, the inode's journal mode could be enabled
8 by ioctl(), and then some pages dirtied after switching the journal
9 mode will still be exposed to ext4_writepages() in non-journaled mode.
10 To resolve this problem, we use an fs-wide per-cpu rw semaphore, at Jan
11 Kara's suggestion, because we don't want to waste ext4_inode_info's
12 space for this extremely rare case.
14 Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
15 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
16 Reviewed-by: Jan Kara <jack@suse.cz>
18 fs/ext4/ext4.h | 4 ++++
19 fs/ext4/inode.c | 15 ++++++++++++---
20 fs/ext4/super.c | 4 ++++
21 kernel/locking/percpu-rwsem.c | 1 +
22 4 files changed, 21 insertions(+), 3 deletions(-)
24 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
25 index 157b458..c757a3d 100644
29 #include <linux/ratelimit.h>
30 #include <crypto/hash.h>
31 #include <linux/falloc.h>
32 +#include <linux/percpu-rwsem.h>
34 #include <linux/compat.h>
36 @@ -1475,6 +1476,9 @@ struct ext4_sb_info {
37 struct ratelimit_state s_err_ratelimit_state;
38 struct ratelimit_state s_warning_ratelimit_state;
39 struct ratelimit_state s_msg_ratelimit_state;
41 + /* Barrier between changing inodes' journal flags and writepages ops. */
42 + struct percpu_rw_semaphore s_journal_flag_rwsem;
45 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
46 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
47 index 71fab4c..4f45f24 100644
50 @@ -2476,11 +2476,14 @@ static int ext4_writepages(struct address_space *mapping,
52 bool give_up_on_write = false;
54 + percpu_down_read(&sbi->s_journal_flag_rwsem);
55 trace_ext4_writepages(inode, wbc);
57 - if (dax_mapping(mapping))
58 - return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
60 + if (dax_mapping(mapping)) {
61 + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
63 + goto out_writepages;
67 * No pages to write? This is mainly a kludge to avoid starting
68 @@ -2650,6 +2653,7 @@ retry:
70 trace_ext4_writepages_result(inode, wbc, ret,
71 nr_to_write - wbc->nr_to_write);
72 + percpu_up_read(&sbi->s_journal_flag_rwsem);
76 @@ -5366,6 +5370,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
80 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
83 * We have to be very careful here: changing a data block's
84 @@ -5405,6 +5410,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
88 + percpu_down_write(&sbi->s_journal_flag_rwsem);
89 jbd2_journal_lock_updates(journal);
92 @@ -5421,6 +5427,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
93 err = jbd2_journal_flush(journal);
95 jbd2_journal_unlock_updates(journal);
96 + percpu_up_write(&sbi->s_journal_flag_rwsem);
97 ext4_inode_resume_unlocked_dio(inode);
100 @@ -5429,6 +5436,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
101 ext4_set_aops(inode);
103 jbd2_journal_unlock_updates(journal);
104 + percpu_up_write(&sbi->s_journal_flag_rwsem);
107 up_write(&EXT4_I(inode)->i_mmap_sem);
108 ext4_inode_resume_unlocked_dio(inode);
109 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
110 index 3ed01ec..a12950d 100644
111 --- a/fs/ext4/super.c
112 +++ b/fs/ext4/super.c
113 @@ -861,6 +861,7 @@ static void ext4_put_super(struct super_block *sb)
114 percpu_counter_destroy(&sbi->s_freeinodes_counter);
115 percpu_counter_destroy(&sbi->s_dirs_counter);
116 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
117 + percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
120 for (i = 0; i < EXT4_MAXQUOTAS; i++)
121 @@ -3926,6 +3927,9 @@ no_journal:
123 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
126 + err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
129 ext4_msg(sb, KERN_ERR, "insufficient memory");
131 diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
132 index f231e0b..bec0b64 100644
133 --- a/kernel/locking/percpu-rwsem.c
134 +++ b/kernel/locking/percpu-rwsem.c
135 @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
136 free_percpu(brw->fast_read_ctr);
137 brw->fast_read_ctr = NULL; /* catch use after free bugs */
139 +EXPORT_SYMBOL_GPL(percpu_free_rwsem);
142 * This is the fast-path for down_read/up_read. If it succeeds we rely