1 ext4: add journal_lazy mount option
3 This option turns on lazy journalling, as described in the
4 FAST 2017 paper, "Evolving Ext4 for Shingled Disks"[1].
6 [1] https://www.usenix.org/conference/fast17/technical-sessions/presentation/aghayev
8 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
11 fs/ext4/inode.c | 2 +-
12 fs/ext4/ioctl.c | 48 +++++++++++++++++++++++++++++++++---------------
13 fs/ext4/super.c | 56 ++++++++++++++++++++++++++++++++++++++++++++------------
14 4 files changed, 79 insertions(+), 28 deletions(-)
16 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
17 index d3108a82f0fb..e76696b303d8 100644
20 @@ -1145,6 +1145,7 @@ struct ext4_inode_info {
21 #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
22 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
23 #define EXT4_MOUNT_JOURNAL_NOCLEANUP 0x2000000 /* Preserve the journal on unmount */
24 +#define EXT4_MOUNT_JOURNAL_LAZY 0x4000000 /* Do lazy writeback of journalled metadata */
25 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
26 #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
27 #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
28 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
29 index 4b5892e31b80..374a9767a0ed 100644
32 @@ -3190,7 +3190,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
33 filemap_write_and_wait(mapping);
36 - if (EXT4_JOURNAL(inode) &&
37 + if (EXT4_JOURNAL(inode) && !test_opt(inode->i_sb, JOURNAL_LAZY) &&
38 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
40 * This is a REALLY heavyweight approach, but the use of
41 diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
42 index b383ebf4020c..ba234eb5a1bd 100644
45 @@ -242,6 +242,20 @@ static int ext4_ioctl_setflags(struct inode *inode,
46 if (!capable(CAP_SYS_RESOURCE))
51 + * Clearing the JOURNAL_DATA flag is *hard* with lazy
52 + * journalling. We can't use jbd2_journal_flush(); instead,
53 + * we would have to make sure all blocks belonging to the file
54 + * are evacuated from the journal and saved to their final
55 + * location on disk. Punt for now.
57 + if ((oldflags & EXT4_JOURNAL_DATA_FL) && !jflag &&
58 + test_opt(inode->i_sb, JOURNAL_LAZY)) {
63 if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
66 @@ -489,6 +503,22 @@ int ext4_goingdown(struct super_block *sb, unsigned long arg)
71 + * If we are using journalling (excepting JBD2 lazy mode), make sure
72 + * the block group descriptors are written out immediately
74 +static int flush_fs_group_descriptors(struct super_block *sb)
78 + if (EXT4_SB(sb)->s_journal && !test_opt(sb, JOURNAL_LAZY)) {
79 + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
80 + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
81 + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
86 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
88 struct inode *inode = file_inode(filp);
89 @@ -606,11 +636,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
90 goto group_extend_out;
92 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
93 - if (EXT4_SB(sb)->s_journal) {
94 - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
95 - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
96 - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
98 + err2 = flush_fs_group_descriptors(sb);
101 mnt_drop_write_file(filp);
102 @@ -696,11 +722,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
105 err = ext4_group_add(sb, &input);
106 - if (EXT4_SB(sb)->s_journal) {
107 - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
108 - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
109 - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
111 + err2 = flush_fs_group_descriptors(sb);
114 mnt_drop_write_file(filp);
115 @@ -786,11 +808,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118 err = ext4_resize_fs(sb, n_blocks_count);
119 - if (EXT4_SB(sb)->s_journal) {
120 - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
121 - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
122 - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
124 + err2 = flush_fs_group_descriptors(sb);
127 mnt_drop_write_file(filp);
128 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
129 index d411cc613bf0..00ecc96be253 100644
130 --- a/fs/ext4/super.c
131 +++ b/fs/ext4/super.c
132 @@ -868,7 +868,8 @@ static void ext4_put_super(struct super_block *sb)
133 ext4_ext_release(sb);
135 if (!(sb->s_flags & MS_RDONLY) && !aborted &&
136 - !test_opt(sb, JOURNAL_NOCLEANUP)) {
137 + !test_opt(sb, JOURNAL_NOCLEANUP) &&
138 + !test_opt(sb, JOURNAL_LAZY)) {
139 ext4_clear_feature_journal_needs_recovery(sb);
140 es->s_state = cpu_to_le16(sbi->s_mount_state);
142 @@ -1310,6 +1311,7 @@ enum {
143 Opt_inode_readahead_blks, Opt_journal_ioprio,
144 Opt_dioread_nolock, Opt_dioread_lock,
145 Opt_journal_nocleanup, Opt_journal_cleanup,
146 + Opt_journal_nolazy, Opt_journal_lazy,
147 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
148 Opt_max_dir_size_kb, Opt_nojournal_checksum,
150 @@ -1396,6 +1398,8 @@ static const match_table_t tokens = {
151 {Opt_test_dummy_encryption, "test_dummy_encryption"},
152 {Opt_journal_nocleanup, "journal_nocleanup"},
153 {Opt_journal_cleanup, "journal_cleanup"},
154 + {Opt_journal_lazy, "journal_lazy"},
155 + {Opt_journal_nolazy, "journal_nolazy"},
156 {Opt_removed, "check=none"}, /* mount option from ext2/3 */
157 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
158 {Opt_removed, "reservation"}, /* mount option from ext2/3 */
159 @@ -1604,6 +1608,8 @@ static const struct mount_opts {
160 {Opt_test_dummy_encryption, 0, MOPT_GTE0},
161 {Opt_journal_nocleanup, EXT4_MOUNT_JOURNAL_NOCLEANUP, MOPT_SET},
162 {Opt_journal_cleanup, EXT4_MOUNT_JOURNAL_NOCLEANUP, MOPT_CLEAR},
163 + {Opt_journal_lazy, EXT4_MOUNT_JOURNAL_LAZY, MOPT_SET},
164 + {Opt_journal_nolazy, EXT4_MOUNT_JOURNAL_LAZY, MOPT_CLEAR},
168 @@ -4355,6 +4361,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
169 journal->j_flags |= JBD2_NO_CLEANUP;
171 journal->j_flags &= ~JBD2_NO_CLEANUP;
172 + if (test_opt(sb, JOURNAL_LAZY))
173 + journal->j_flags |= JBD2_LAZY;
175 + journal->j_flags &= ~JBD2_LAZY;
176 write_unlock(&journal->j_state_lock);
179 @@ -4588,6 +4598,24 @@ static int ext4_load_journal(struct super_block *sb,
181 EXT4_SB(sb)->s_journal = journal;
182 ext4_clear_journal_err(sb, es);
184 + if (test_opt(sb, JOURNAL_LAZY)) {
185 + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
187 + /* Read the latest version of the superblock from the journal */
189 + clear_buffer_uptodate(sbh);
190 + err = jbd2_bh_submit_read(journal, sbh, __func__);
192 + ext4_msg(sb, KERN_ERR, "error rereading superblock %d",
194 + set_buffer_uptodate(sbh);
196 + if (!ext4_superblock_csum_verify(sb, es))
197 + ext4_msg(sb, KERN_ERR,
198 + "superblock csum doesn't verify"
199 + "after journal replay!");
204 @@ -4674,6 +4702,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
206 journal_t *journal = EXT4_SB(sb)->s_journal;
208 + if (test_opt(sb, JOURNAL_LAZY))
211 if (!ext4_has_feature_journal(sb)) {
212 BUG_ON(journal != NULL);
214 @@ -4810,21 +4841,20 @@ static int ext4_freeze(struct super_block *sb)
215 journal = EXT4_SB(sb)->s_journal;
218 - /* Now we set up the journal barrier. */
219 - jbd2_journal_lock_updates(journal);
222 - * Don't clear the needs_recovery flag if we failed to
223 - * flush the journal.
224 + * Set the journal barrier, then flush the journal and
225 + * clear the needs_recovery flag if we are not in
228 - error = jbd2_journal_flush(journal);
231 + jbd2_journal_lock_updates(journal);
233 - /* Journal blocked and flushed, clear needs_recovery flag. */
234 + if (!test_opt(sb, JOURNAL_LAZY)) {
235 + error = jbd2_journal_flush(journal);
239 ext4_clear_feature_journal_needs_recovery(sb);
242 error = ext4_commit_super(sb, 1);
245 @@ -4842,7 +4872,7 @@ static int ext4_unfreeze(struct super_block *sb)
246 if ((sb->s_flags & MS_RDONLY) || ext4_forced_shutdown(EXT4_SB(sb)))
249 - if (EXT4_SB(sb)->s_journal) {
250 + if (EXT4_SB(sb)->s_journal && !test_opt(sb, JOURNAL_LAZY)) {
251 /* Reset the needs_recovery flag before the fs is unlocked. */
252 ext4_set_feature_journal_needs_recovery(sb);
254 @@ -5358,6 +5388,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
255 * We don't need to lock updates but journal_flush() could
256 * otherwise be livelocked...
258 + if (test_opt(sb, JOURNAL_LAZY))
259 + return -EOPNOTSUPP;
260 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
261 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
262 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);