Fix delalloc-debug
[ext4-patch-queue.git] / refine-waiting-for-shadow-buffers
blob1bab7a8add48fa3969af069ba91a586260b532c0
1 jbd2: refine waiting for shadow buffers
3 From: Jan Kara <jack@suse.cz>
5 Currently when we add a buffer to a transaction, we wait until the
6 buffer is removed from the BJ_Shadow list (so that we prevent any changes
7 to the buffer while it is being written to the journal).  This can take
8 unnecessarily long as a lot happens between the time the buffer is
9 submitted to the journal and the time when we remove the buffer from
10 BJ_Shadow list (e.g. we wait for all data buffers in the
11 transaction, we issue a cache flush, etc.).  Also, this creates a
12 dependency of do_get_write_access() on transaction commit (namely
13 waiting for data IO to complete) which we want to avoid when
14 implementing transaction reservation.
16 So we modify the commit code to set a new BH_Shadow flag when a temporary
17 shadowing buffer is created, and we clear that flag once IO on that
18 buffer is complete.  This allows do_get_write_access() to wait only
19 for the BH_Shadow bit and thus removes the dependency on data IO
20 completion.
22 Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
23 Signed-off-by: Jan Kara <jack@suse.cz>
24 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
25 ---
26  fs/jbd2/commit.c           | 18 +++++++++---------
27  fs/jbd2/journal.c          |  2 ++
28  fs/jbd2/transaction.c      | 44 +++++++++++++++++++-------------------------
29  include/linux/jbd.h        | 25 +++++++++++++++++++++++++
30  include/linux/jbd2.h       | 28 ++++++++++++++++++++++++++++
31  include/linux/jbd_common.h | 26 --------------------------
32  6 files changed, 83 insertions(+), 60 deletions(-)
34 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
35 index dd92fc7..b992e16 100644
36 --- a/fs/jbd2/commit.c
37 +++ b/fs/jbd2/commit.c
38 @@ -30,15 +30,22 @@
39  #include <trace/events/jbd2.h>
41  /*
42 - * Default IO end handler for temporary BJ_IO buffer_heads.
43 + * IO end handler for temporary buffer_heads handling writes to the journal.
44   */
45  static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
46  {
47 +       struct buffer_head *orig_bh = bh->b_private;
49         BUFFER_TRACE(bh, "");
50         if (uptodate)
51                 set_buffer_uptodate(bh);
52         else
53                 clear_buffer_uptodate(bh);
54 +       if (orig_bh) {
55 +               clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
56 +               smp_mb__after_clear_bit();
57 +               wake_up_bit(&orig_bh->b_state, BH_Shadow);
58 +       }
59         unlock_buffer(bh);
60  }
62 @@ -831,6 +838,7 @@ start_journal_io:
63                 bh = jh2bh(jh);
64                 clear_buffer_jwrite(bh);
65                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
66 +               J_ASSERT_BH(bh, !buffer_shadow(bh));
68                 /* The metadata is now released for reuse, but we need
69                     to remember it against this transaction so that when
70 @@ -838,14 +846,6 @@ start_journal_io:
71                     required. */
72                 JBUFFER_TRACE(jh, "file as BJ_Forget");
73                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
74 -               /*
75 -                * Wake up any transactions which were waiting for this IO to
76 -                * complete. The barrier must be here so that changes by
77 -                * jbd2_journal_file_buffer() take effect before wake_up_bit()
78 -                * does the waitqueue check.
79 -                */
80 -               smp_mb();
81 -               wake_up_bit(&bh->b_state, BH_Unshadow);
82                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
83                 __brelse(bh);
84         }
85 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
86 index 96e0594..e812030 100644
87 --- a/fs/jbd2/journal.c
88 +++ b/fs/jbd2/journal.c
89 @@ -451,6 +451,7 @@ repeat:
90         new_bh->b_size = bh_in->b_size;
91         new_bh->b_bdev = journal->j_dev;
92         new_bh->b_blocknr = blocknr;
93 +       new_bh->b_private = bh_in;
94         set_buffer_mapped(new_bh);
95         set_buffer_dirty(new_bh);
97 @@ -465,6 +466,7 @@ repeat:
98         spin_lock(&journal->j_list_lock);
99         __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
100         spin_unlock(&journal->j_list_lock);
101 +       set_buffer_shadow(bh_in);
102         jbd_unlock_bh_state(bh_in);
104         return do_escape | (done_copy_out << 1);
105 diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
106 index 213a43b..4d5ef4b 100644
107 --- a/fs/jbd2/transaction.c
108 +++ b/fs/jbd2/transaction.c
109 @@ -619,6 +619,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
110                bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
113 +static int sleep_on_shadow_bh(void *word)
115 +       io_schedule();
116 +       return 0;
119  /*
120   * If the buffer is already part of the current transaction, then there
121   * is nothing we need to do.  If it is already part of a prior
122 @@ -754,41 +760,29 @@ repeat:
123                  * journaled.  If the primary copy is already going to
124                  * disk then we cannot do copy-out here. */
126 -               if (jh->b_jlist == BJ_Shadow) {
127 -                       DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
128 -                       wait_queue_head_t *wqh;
130 -                       wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
132 +               if (buffer_shadow(bh)) {
133                         JBUFFER_TRACE(jh, "on shadow: sleep");
134                         jbd_unlock_bh_state(bh);
135 -                       /* commit wakes up all shadow buffers after IO */
136 -                       for ( ; ; ) {
137 -                               prepare_to_wait(wqh, &wait.wait,
138 -                                               TASK_UNINTERRUPTIBLE);
139 -                               if (jh->b_jlist != BJ_Shadow)
140 -                                       break;
141 -                               schedule();
142 -                       }
143 -                       finish_wait(wqh, &wait.wait);
144 +                       wait_on_bit(&bh->b_state, BH_Shadow,
145 +                                   sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
146                         goto repeat;
147                 }
149 -               /* Only do the copy if the currently-owning transaction
150 -                * still needs it.  If it is on the Forget list, the
151 -                * committing transaction is past that stage.  The
152 -                * buffer had better remain locked during the kmalloc,
153 -                * but that should be true --- we hold the journal lock
154 -                * still and the buffer is already on the BUF_JOURNAL
155 -                * list so won't be flushed.
156 +               /*
157 +                * Only do the copy if the currently-owning transaction still
158 +                * needs it. If buffer isn't on BJ_Metadata list, the
159 +                * committing transaction is past that stage (here we use the
160 +                * fact that BH_Shadow is set under bh_state lock together with
161 +                * refiling to BJ_Shadow list and at this point we know the
162 +                * buffer doesn't have BH_Shadow set).
163                  *
164                  * Subtle point, though: if this is a get_undo_access,
165                  * then we will be relying on the frozen_data to contain
166                  * the new value of the committed_data record after the
167                  * transaction, so we HAVE to force the frozen_data copy
168 -                * in that case. */
170 -               if (jh->b_jlist != BJ_Forget || force_copy) {
171 +                * in that case.
172 +                */
173 +               if (jh->b_jlist == BJ_Metadata || force_copy) {
174                         JBUFFER_TRACE(jh, "generate frozen data");
175                         if (!frozen_buffer) {
176                                 JBUFFER_TRACE(jh, "allocate memory for buffer");
177 diff --git a/include/linux/jbd.h b/include/linux/jbd.h
178 index 7e0b622..92062ee 100644
179 --- a/include/linux/jbd.h
180 +++ b/include/linux/jbd.h
181 @@ -244,6 +244,31 @@ typedef struct journal_superblock_s
183  #include <linux/fs.h>
184  #include <linux/sched.h>
186 +enum jbd_state_bits {
187 +       BH_JBD                  /* Has an attached ext3 journal_head */
188 +         = BH_PrivateStart,
189 +       BH_JWrite,              /* Being written to log (@@@ DEBUGGING) */
190 +       BH_Freed,               /* Has been freed (truncated) */
191 +       BH_Revoked,             /* Has been revoked from the log */
192 +       BH_RevokeValid,         /* Revoked flag is valid */
193 +       BH_JBDDirty,            /* Is dirty but journaled */
194 +       BH_State,               /* Pins most journal_head state */
195 +       BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
196 +       BH_Unshadow,            /* Dummy bit, for BJ_Shadow wakeup filtering */
197 +       BH_JBDPrivateStart,     /* First bit available for private use by FS */
200 +BUFFER_FNS(JBD, jbd)
201 +BUFFER_FNS(JWrite, jwrite)
202 +BUFFER_FNS(JBDDirty, jbddirty)
203 +TAS_BUFFER_FNS(JBDDirty, jbddirty)
204 +BUFFER_FNS(Revoked, revoked)
205 +TAS_BUFFER_FNS(Revoked, revoked)
206 +BUFFER_FNS(RevokeValid, revokevalid)
207 +TAS_BUFFER_FNS(RevokeValid, revokevalid)
208 +BUFFER_FNS(Freed, freed)
210  #include <linux/jbd_common.h>
212  #define J_ASSERT(assert)       BUG_ON(!(assert))
213 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
214 index bdb9ae4..a687c8d 100644
215 --- a/include/linux/jbd2.h
216 +++ b/include/linux/jbd2.h
217 @@ -302,6 +302,34 @@ typedef struct journal_superblock_s
219  #include <linux/fs.h>
220  #include <linux/sched.h>
222 +enum jbd_state_bits {
223 +       BH_JBD                  /* Has an attached ext3 journal_head */
224 +         = BH_PrivateStart,
225 +       BH_JWrite,              /* Being written to log (@@@ DEBUGGING) */
226 +       BH_Freed,               /* Has been freed (truncated) */
227 +       BH_Revoked,             /* Has been revoked from the log */
228 +       BH_RevokeValid,         /* Revoked flag is valid */
229 +       BH_JBDDirty,            /* Is dirty but journaled */
230 +       BH_State,               /* Pins most journal_head state */
231 +       BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
232 +       BH_Shadow,              /* IO on shadow buffer is running */
233 +       BH_Verified,            /* Metadata block has been verified ok */
234 +       BH_JBDPrivateStart,     /* First bit available for private use by FS */
237 +BUFFER_FNS(JBD, jbd)
238 +BUFFER_FNS(JWrite, jwrite)
239 +BUFFER_FNS(JBDDirty, jbddirty)
240 +TAS_BUFFER_FNS(JBDDirty, jbddirty)
241 +BUFFER_FNS(Revoked, revoked)
242 +TAS_BUFFER_FNS(Revoked, revoked)
243 +BUFFER_FNS(RevokeValid, revokevalid)
244 +TAS_BUFFER_FNS(RevokeValid, revokevalid)
245 +BUFFER_FNS(Freed, freed)
246 +BUFFER_FNS(Shadow, shadow)
247 +BUFFER_FNS(Verified, verified)
249  #include <linux/jbd_common.h>
251  #define J_ASSERT(assert)       BUG_ON(!(assert))
252 diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h
253 index 6133679..b1f7089 100644
254 --- a/include/linux/jbd_common.h
255 +++ b/include/linux/jbd_common.h
256 @@ -1,32 +1,6 @@
257  #ifndef _LINUX_JBD_STATE_H
258  #define _LINUX_JBD_STATE_H
260 -enum jbd_state_bits {
261 -       BH_JBD                  /* Has an attached ext3 journal_head */
262 -         = BH_PrivateStart,
263 -       BH_JWrite,              /* Being written to log (@@@ DEBUGGING) */
264 -       BH_Freed,               /* Has been freed (truncated) */
265 -       BH_Revoked,             /* Has been revoked from the log */
266 -       BH_RevokeValid,         /* Revoked flag is valid */
267 -       BH_JBDDirty,            /* Is dirty but journaled */
268 -       BH_State,               /* Pins most journal_head state */
269 -       BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
270 -       BH_Unshadow,            /* Dummy bit, for BJ_Shadow wakeup filtering */
271 -       BH_Verified,            /* Metadata block has been verified ok */
272 -       BH_JBDPrivateStart,     /* First bit available for private use by FS */
275 -BUFFER_FNS(JBD, jbd)
276 -BUFFER_FNS(JWrite, jwrite)
277 -BUFFER_FNS(JBDDirty, jbddirty)
278 -TAS_BUFFER_FNS(JBDDirty, jbddirty)
279 -BUFFER_FNS(Revoked, revoked)
280 -TAS_BUFFER_FNS(Revoked, revoked)
281 -BUFFER_FNS(RevokeValid, revokevalid)
282 -TAS_BUFFER_FNS(RevokeValid, revokevalid)
283 -BUFFER_FNS(Freed, freed)
284 -BUFFER_FNS(Verified, verified)
286  static inline struct buffer_head *jh2bh(struct journal_head *jh)
288         return jh->b_bh;
289 -- 
290 1.8.1.4