1 jbd2: refine waiting for shadow buffers
3 From: Jan Kara <jack@suse.cz>
5 Currently when we add a buffer to a transaction, we wait until the
6 buffer is removed from BJ_Shadow list (so that we prevent any changes
7 to the buffer that is just written to the journal). This can take
8 unnecessarily long as a lot happens between the time the buffer is
9 submitted to the journal and the time when we remove the buffer from
10 BJ_Shadow list (e.g. we wait for all data buffers in the
11 transaction, we issue a cache flush, etc.). Also this creates a
12 dependency of do_get_write_access() on transaction commit (namely
13 waiting for data IO to complete) which we want to avoid when
14 implementing transaction reservation.
16 So we modify the commit code to set a new BH_Shadow flag when the
17 temporary shadowing buffer is created, and we clear that flag once IO
18 on that buffer is complete. This allows do_get_write_access() to wait
19 only for the BH_Shadow bit and thus removes the dependency on data IO
20 completion.
22 Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
23 Signed-off-by: Jan Kara <jack@suse.cz>
24 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
26 fs/jbd2/commit.c | 18 +++++++++---------
27 fs/jbd2/journal.c | 2 ++
28 fs/jbd2/transaction.c | 44 +++++++++++++++++++-------------------------
29 include/linux/jbd.h | 25 +++++++++++++++++++++++++
30 include/linux/jbd2.h | 28 ++++++++++++++++++++++++++++
31 include/linux/jbd_common.h | 26 --------------------------
32 6 files changed, 83 insertions(+), 60 deletions(-)
34 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
35 index dd92fc7..b992e16 100644
36 --- a/fs/jbd2/commit.c
37 +++ b/fs/jbd2/commit.c
39 #include <trace/events/jbd2.h>
42 - * Default IO end handler for temporary BJ_IO buffer_heads.
43 + * IO end handler for temporary buffer_heads handling writes to the journal.
45 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
47 + struct buffer_head *orig_bh = bh->b_private;
51 set_buffer_uptodate(bh);
53 clear_buffer_uptodate(bh);
55 + clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
56 + smp_mb__after_clear_bit();
57 + wake_up_bit(&orig_bh->b_state, BH_Shadow);
62 @@ -831,6 +838,7 @@ start_journal_io:
64 clear_buffer_jwrite(bh);
65 J_ASSERT_BH(bh, buffer_jbddirty(bh));
66 + J_ASSERT_BH(bh, !buffer_shadow(bh));
68 /* The metadata is now released for reuse, but we need
69 to remember it against this transaction so that when
70 @@ -838,14 +846,6 @@ start_journal_io:
72 JBUFFER_TRACE(jh, "file as BJ_Forget");
73 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
75 - * Wake up any transactions which were waiting for this IO to
76 - * complete. The barrier must be here so that changes by
77 - * jbd2_journal_file_buffer() take effect before wake_up_bit()
78 - * does the waitqueue check.
81 - wake_up_bit(&bh->b_state, BH_Unshadow);
82 JBUFFER_TRACE(jh, "brelse shadowed buffer");
85 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
86 index 96e0594..e812030 100644
87 --- a/fs/jbd2/journal.c
88 +++ b/fs/jbd2/journal.c
89 @@ -451,6 +451,7 @@ repeat:
90 new_bh->b_size = bh_in->b_size;
91 new_bh->b_bdev = journal->j_dev;
92 new_bh->b_blocknr = blocknr;
93 + new_bh->b_private = bh_in;
94 set_buffer_mapped(new_bh);
95 set_buffer_dirty(new_bh);
97 @@ -465,6 +466,7 @@ repeat:
98 spin_lock(&journal->j_list_lock);
99 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
100 spin_unlock(&journal->j_list_lock);
101 + set_buffer_shadow(bh_in);
102 jbd_unlock_bh_state(bh_in);
104 return do_escape | (done_copy_out << 1);
105 diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
106 index 213a43b..4d5ef4b 100644
107 --- a/fs/jbd2/transaction.c
108 +++ b/fs/jbd2/transaction.c
109 @@ -619,6 +619,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
110 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
113 +static int sleep_on_shadow_bh(void *word)
120 * If the buffer is already part of the current transaction, then there
121 * is nothing we need to do. If it is already part of a prior
122 @@ -754,41 +760,29 @@ repeat:
123 * journaled. If the primary copy is already going to
124 * disk then we cannot do copy-out here. */
126 - if (jh->b_jlist == BJ_Shadow) {
127 - DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
128 - wait_queue_head_t *wqh;
130 - wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
132 + if (buffer_shadow(bh)) {
133 JBUFFER_TRACE(jh, "on shadow: sleep");
134 jbd_unlock_bh_state(bh);
135 - /* commit wakes up all shadow buffers after IO */
137 - prepare_to_wait(wqh, &wait.wait,
138 - TASK_UNINTERRUPTIBLE);
139 - if (jh->b_jlist != BJ_Shadow)
143 - finish_wait(wqh, &wait.wait);
144 + wait_on_bit(&bh->b_state, BH_Shadow,
145 + sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
149 - /* Only do the copy if the currently-owning transaction
150 - * still needs it. If it is on the Forget list, the
151 - * committing transaction is past that stage. The
152 - * buffer had better remain locked during the kmalloc,
153 - * but that should be true --- we hold the journal lock
154 - * still and the buffer is already on the BUF_JOURNAL
155 - * list so won't be flushed.
157 + * Only do the copy if the currently-owning transaction still
158 + * needs it. If buffer isn't on BJ_Metadata list, the
159 + * committing transaction is past that stage (here we use the
160 + * fact that BH_Shadow is set under bh_state lock together with
161 + * refiling to BJ_Shadow list and at this point we know the
162 + * buffer doesn't have BH_Shadow set).
164 * Subtle point, though: if this is a get_undo_access,
165 * then we will be relying on the frozen_data to contain
166 * the new value of the committed_data record after the
167 * transaction, so we HAVE to force the frozen_data copy
170 - if (jh->b_jlist != BJ_Forget || force_copy) {
173 + if (jh->b_jlist == BJ_Metadata || force_copy) {
174 JBUFFER_TRACE(jh, "generate frozen data");
175 if (!frozen_buffer) {
176 JBUFFER_TRACE(jh, "allocate memory for buffer");
177 diff --git a/include/linux/jbd.h b/include/linux/jbd.h
178 index 7e0b622..92062ee 100644
179 --- a/include/linux/jbd.h
180 +++ b/include/linux/jbd.h
181 @@ -244,6 +244,31 @@ typedef struct journal_superblock_s
183 #include <linux/fs.h>
184 #include <linux/sched.h>
186 +enum jbd_state_bits {
187 + BH_JBD /* Has an attached ext3 journal_head */
189 + BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
190 + BH_Freed, /* Has been freed (truncated) */
191 + BH_Revoked, /* Has been revoked from the log */
192 + BH_RevokeValid, /* Revoked flag is valid */
193 + BH_JBDDirty, /* Is dirty but journaled */
194 + BH_State, /* Pins most journal_head state */
195 + BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
196 + BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
197 + BH_JBDPrivateStart, /* First bit available for private use by FS */
200 +BUFFER_FNS(JBD, jbd)
201 +BUFFER_FNS(JWrite, jwrite)
202 +BUFFER_FNS(JBDDirty, jbddirty)
203 +TAS_BUFFER_FNS(JBDDirty, jbddirty)
204 +BUFFER_FNS(Revoked, revoked)
205 +TAS_BUFFER_FNS(Revoked, revoked)
206 +BUFFER_FNS(RevokeValid, revokevalid)
207 +TAS_BUFFER_FNS(RevokeValid, revokevalid)
208 +BUFFER_FNS(Freed, freed)
210 #include <linux/jbd_common.h>
212 #define J_ASSERT(assert) BUG_ON(!(assert))
213 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
214 index bdb9ae4..a687c8d 100644
215 --- a/include/linux/jbd2.h
216 +++ b/include/linux/jbd2.h
217 @@ -302,6 +302,34 @@ typedef struct journal_superblock_s
219 #include <linux/fs.h>
220 #include <linux/sched.h>
222 +enum jbd_state_bits {
223 + BH_JBD /* Has an attached ext3 journal_head */
225 + BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
226 + BH_Freed, /* Has been freed (truncated) */
227 + BH_Revoked, /* Has been revoked from the log */
228 + BH_RevokeValid, /* Revoked flag is valid */
229 + BH_JBDDirty, /* Is dirty but journaled */
230 + BH_State, /* Pins most journal_head state */
231 + BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
232 + BH_Shadow, /* IO on shadow buffer is running */
233 + BH_Verified, /* Metadata block has been verified ok */
234 + BH_JBDPrivateStart, /* First bit available for private use by FS */
237 +BUFFER_FNS(JBD, jbd)
238 +BUFFER_FNS(JWrite, jwrite)
239 +BUFFER_FNS(JBDDirty, jbddirty)
240 +TAS_BUFFER_FNS(JBDDirty, jbddirty)
241 +BUFFER_FNS(Revoked, revoked)
242 +TAS_BUFFER_FNS(Revoked, revoked)
243 +BUFFER_FNS(RevokeValid, revokevalid)
244 +TAS_BUFFER_FNS(RevokeValid, revokevalid)
245 +BUFFER_FNS(Freed, freed)
246 +BUFFER_FNS(Shadow, shadow)
247 +BUFFER_FNS(Verified, verified)
249 #include <linux/jbd_common.h>
251 #define J_ASSERT(assert) BUG_ON(!(assert))
252 diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
253 index 6133679..b1f7089 100644
254 --- a/include/linux/jbd_common.h
255 +++ b/include/linux/jbd_common.h
257 #ifndef _LINUX_JBD_STATE_H
258 #define _LINUX_JBD_STATE_H
260 -enum jbd_state_bits {
261 - BH_JBD /* Has an attached ext3 journal_head */
263 - BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
264 - BH_Freed, /* Has been freed (truncated) */
265 - BH_Revoked, /* Has been revoked from the log */
266 - BH_RevokeValid, /* Revoked flag is valid */
267 - BH_JBDDirty, /* Is dirty but journaled */
268 - BH_State, /* Pins most journal_head state */
269 - BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
270 - BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
271 - BH_Verified, /* Metadata block has been verified ok */
272 - BH_JBDPrivateStart, /* First bit available for private use by FS */
275 -BUFFER_FNS(JBD, jbd)
276 -BUFFER_FNS(JWrite, jwrite)
277 -BUFFER_FNS(JBDDirty, jbddirty)
278 -TAS_BUFFER_FNS(JBDDirty, jbddirty)
279 -BUFFER_FNS(Revoked, revoked)
280 -TAS_BUFFER_FNS(Revoked, revoked)
281 -BUFFER_FNS(RevokeValid, revokevalid)
282 -TAS_BUFFER_FNS(RevokeValid, revokevalid)
283 -BUFFER_FNS(Freed, freed)
284 -BUFFER_FNS(Verified, verified)
286 static inline struct buffer_head *jh2bh(struct journal_head *jh)