1 ext4: avoid unnecessarily writing back dirty pages before hole punching
3 From: Li Wang <liwang@ubuntukylin.com>
5 For hole punching, ext4 currently writes back synchronously the dirty
6 pages that fall within the hole. Since the on-disk data corresponding
7 to those pages is about to be deleted, it is beneficial to release
8 those pages directly, whether they are dirty or not, except in the ordered case.
10 [ Fixed error return to unlock i_mutex if ext4_begin_ordered_punch_hole()
11 fails. Thanks to Wei Yongjun <yongjun_wei@trendmicro.com.cn> for
14 Signed-off-by: Li Wang <liwang@ubuntukylin.com>
15 Signed-off-by: Yunchuan Wen <yunchuanwen@ubuntukylin.com>
16 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
17 Cc: Dmitry Monakhov <dmonakhov@openvz.org>
18 Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
19 Reviewed-by: Jan Kara <jack@suse.cz>
21 fs/ext4/inode.c | 28 ++++++++++++++++------------
22 fs/jbd2/journal.c | 2 +-
23 fs/jbd2/transaction.c | 29 ++++++-----------------------
24 include/linux/jbd2.h | 33 +++++++++++++++++++++++++++++++--
25 4 files changed, 54 insertions(+), 38 deletions(-)
27 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
28 index 0db830d..06136b5 100644
31 @@ -3466,6 +3466,16 @@ int ext4_can_truncate(struct inode *inode)
35 +static inline int ext4_begin_ordered_punch_hole(struct inode *inode,
36 + loff_t start, loff_t length)
38 + if (!EXT4_I(inode)->jinode)
40 + return jbd2_journal_begin_ordered_punch_hole(EXT4_JOURNAL(inode),
41 + EXT4_I(inode)->jinode,
42 + start, start+length-1);
46 * ext4_punch_hole: punches a hole in a file by releaseing the blocks
47 * associated with the given offset and length
48 @@ -3482,7 +3492,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
49 struct inode *inode = file_inode(file);
50 struct super_block *sb = inode->i_sb;
51 ext4_lblk_t first_block, stop_block;
52 - struct address_space *mapping = inode->i_mapping;
53 loff_t first_block_offset, last_block_offset;
56 @@ -3498,17 +3507,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
58 trace_ext4_punch_hole(inode, offset, length);
61 - * Write out all dirty pages to avoid race conditions
62 - * Then release them.
64 - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
65 - ret = filemap_write_and_wait_range(mapping, offset,
66 - offset + length - 1);
71 mutex_lock(&inode->i_mutex);
72 /* It's not possible punch hole on append only file */
73 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
74 @@ -3537,6 +3535,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
75 first_block_offset = round_up(offset, sb->s_blocksize);
76 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
78 + if (ext4_should_order_data(inode)) {
79 + ret = ext4_begin_ordered_punch_hole(inode, offset, length);
84 /* Now release the pages and zero block aligned part of pages*/
85 if (last_block_offset > first_block_offset)
86 truncate_pagecache_range(inode, first_block_offset,
87 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
88 index 915dd57..4c8b8d4 100644
89 --- a/fs/jbd2/journal.c
90 +++ b/fs/jbd2/journal.c
91 @@ -97,7 +97,7 @@ EXPORT_SYMBOL(jbd2_journal_force_commit);
92 EXPORT_SYMBOL(jbd2_journal_file_inode);
93 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
94 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
95 -EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
96 +EXPORT_SYMBOL(jbd2_journal_begin_ordered_punch_hole);
97 EXPORT_SYMBOL(jbd2_inode_cache);
99 static void __journal_abort_soft (journal_t *journal, int errno);
100 diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
101 index dd422e6..91d62e1 100644
102 --- a/fs/jbd2/transaction.c
103 +++ b/fs/jbd2/transaction.c
104 @@ -2419,29 +2419,10 @@ done:
109 - * File truncate and transaction commit interact with each other in a
110 - * non-trivial way. If a transaction writing data block A is
111 - * committing, we cannot discard the data by truncate until we have
112 - * written them. Otherwise if we crashed after the transaction with
113 - * write has committed but before the transaction with truncate has
114 - * committed, we could see stale data in block A. This function is a
115 - * helper to solve this problem. It starts writeout of the truncated
116 - * part in case it is in the committing transaction.
118 - * Filesystem code must call this function when inode is journaled in
119 - * ordered mode before truncation happens and after the inode has been
120 - * placed on orphan list with the new inode size. The second condition
121 - * avoids the race that someone writes new data and we start
122 - * committing the transaction after this function has been called but
123 - * before a transaction for truncate is started (and furthermore it
124 - * allows us to optimize the case where the addition to orphan list
125 - * happens in the same transaction as write --- we don't have to write
126 - * any data in such case).
128 -int jbd2_journal_begin_ordered_truncate(journal_t *journal,
130 +int jbd2_journal_begin_ordered_punch_hole(journal_t *journal,
131 struct jbd2_inode *jinode,
133 + loff_t start, loff_t end)
135 transaction_t *inode_trans, *commit_trans;
137 @@ -2460,10 +2441,12 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
138 spin_unlock(&journal->j_list_lock);
139 if (inode_trans == commit_trans) {
140 ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
141 - new_size, LLONG_MAX);
144 jbd2_journal_abort(journal, ret);
151 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
152 index 0302f3f..5f3c094 100644
153 --- a/include/linux/jbd2.h
154 +++ b/include/linux/jbd2.h
155 @@ -1157,12 +1157,41 @@ extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *
156 extern int jbd2_journal_force_commit(journal_t *);
157 extern int jbd2_journal_force_commit_nested(journal_t *);
158 extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
159 -extern int jbd2_journal_begin_ordered_truncate(journal_t *journal,
160 - struct jbd2_inode *inode, loff_t new_size);
161 +extern int jbd2_journal_begin_ordered_punch_hole(journal_t *,
162 + struct jbd2_inode *,
164 extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
165 extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
168 + * File truncate and transaction commit interact with each other in a
169 + * non-trivial way. If a transaction writing data block A is
170 + * committing, we cannot discard the data by truncate until we have
171 + * written them. Otherwise if we crashed after the transaction with
172 + * write has committed but before the transaction with truncate has
173 + * committed, we could see stale data in block A. This function is a
174 + * helper to solve this problem. It starts writeout of the truncated
175 + * part in case it is in the committing transaction.
177 + * Filesystem code must call this function when inode is journaled in
178 + * ordered mode before truncation happens and after the inode has been
179 + * placed on orphan list with the new inode size. The second condition
180 + * avoids the race that someone writes new data and we start
181 + * committing the transaction after this function has been called but
182 + * before a transaction for truncate is started (and furthermore it
183 + * allows us to optimize the case where the addition to orphan list
184 + * happens in the same transaction as write --- we don't have to write
185 + * any data in such case).
187 +static inline int jbd2_journal_begin_ordered_truncate(journal_t *journal,
188 + struct jbd2_inode *jinode,
191 + return jbd2_journal_begin_ordered_punch_hole(journal, jinode,
192 + new_size, LLONG_MAX);
196 * journal_head management
198 struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh);