jbd: Fix jbd_commit_transaction()/journal_try_to_free_buffers() race

From: Mingming Cao <cmm@us.ibm.com>
There are a few cases where direct I/O can race with kjournald flushing
data buffers, which can cause direct I/O to return an EIO error.
1) journal_submit_data_buffers() tries to take the bh_state lock. If the
trylock fails, it drops the j_list_lock and sleeps waiting for the
bh_state lock, while still holding a reference on the buffer. In the
meanwhile, journal_try_to_free_buffers() can clean up the journal head
and call try_to_free_buffers(). try_to_free_buffers() fails because of
the reference held by journal_submit_data_buffers() - which in turn
causes invalidate_inode_pages2() to fail for DIO.
2) While the buffer is on t_locked_list waiting for its IO to finish, if
we cannot get the bh_state lock we hold a reference and give up the CPU.
This causes try_to_free_buffers() to fail.
3) When journal_submit_data_buffers() sees that a buffer bh1 is dirty
but fails to lock it, it releases the j_list_lock and submits the other
buffers collected from the previous pass, with the reference to bh1
still held. During this window journal_try_to_free_buffers() can clean
up the journal head of bh1 and remove it from the t_sync_datalist. Then
try_to_free_buffers() fails because of the reference held by
journal_submit_data_buffers().
4) journal_submit_data_buffers() has already removed the jh from the bh
(when it found the buffer already being synced), but still keeps a
reference on the buffer head. If journal_try_to_free_buffers() is called
at this point, try_to_free_buffers() runs (there is no jh attached to
the buffer any more) and fails, because
journal_submit_data_buffers() has not finished its cleanup yet.
The fix for the first three races is to drop the reference on the buffer
head when releasing the j_list_lock, give up the CPU, and retry the
whole operation.
This patch also fixes the fourth race, where the data buffer has been
flushed to disk and the journal head cleared by
journal_submit_data_buffers(), but the buffer head reference has not yet
been released when journal_try_to_free_buffers() kicks in.
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Mingming Cao <mcao@us.ibm.com>
---
 fs/jbd/commit.c      |   24 ++++++++++++++++--------
 fs/jbd/transaction.c |   13 +++++++++++++
 2 files changed, 29 insertions(+), 8 deletions(-)
Index: linux-2.6.26-rc2/fs/jbd/commit.c
===================================================================
--- linux-2.6.26-rc2.orig/fs/jbd/commit.c	2008-05-16 11:03:20.000000000 -0700
+++ linux-2.6.26-rc2/fs/jbd/commit.c	2008-05-16 11:03:26.000000000 -0700
@@ -79,12 +79,16 @@ nope:
 
 /*
  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held.  For ranking reasons we must trylock.  If we lose, schedule away and
+ * held.  For ranking reasons we must trylock.  If we lose, unlock the buffer
+ * if needed, drop the reference on the buffer, schedule away and
  * return 0.  j_list_lock is dropped in this case.
  */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+static int inverted_lock(journal_t *journal, struct buffer_head *bh, int locked)
 {
 	if (!jbd_trylock_bh_state(bh)) {
+		if (locked)
+			unlock_buffer(bh);
+		put_bh(bh);
 		spin_unlock(&journal->j_list_lock);
 		schedule();
 		return 0;
@@ -209,19 +213,24 @@ write_out_data:
 		if (buffer_dirty(bh)) {
 			if (test_set_buffer_locked(bh)) {
 				BUFFER_TRACE(bh, "needs blocking lock");
+				put_bh(bh);
 				spin_unlock(&journal->j_list_lock);
 				/* Write out all data to prevent deadlocks */
 				journal_do_submit_data(wbuf, bufs);
 				bufs = 0;
-				lock_buffer(bh);
 				spin_lock(&journal->j_list_lock);
+				continue;
 			}
 			locked = 1;
 		}
-		/* We have to get bh_state lock. Again out of order, sigh. */
-		if (!inverted_lock(journal, bh)) {
-			jbd_lock_bh_state(bh);
+		/*
+		 * We have to get the bh_state lock.  If the trylock fails,
+		 * release the ref on the buffer, give up the cpu and retry
+		 * the whole operation.
+		 */
+		if (!inverted_lock(journal, bh, locked)) {
 			spin_lock(&journal->j_list_lock);
+			continue;
 		}
 		/* Someone already cleaned up the buffer? */
 		if (!buffer_jbd(bh)
@@ -430,8 +439,7 @@ void journal_commit_transaction(journal_
 				err = -EIO;
 			spin_lock(&journal->j_list_lock);
 		}
-		if (!inverted_lock(journal, bh)) {
-			put_bh(bh);
+		if (!inverted_lock(journal, bh, 0)) {
 			spin_lock(&journal->j_list_lock);
 			continue;
 		}
Index: linux-2.6.26-rc2/fs/jbd/transaction.c
===================================================================
--- linux-2.6.26-rc2.orig/fs/jbd/transaction.c	2008-05-16 11:03:20.000000000 -0700
+++ linux-2.6.26-rc2/fs/jbd/transaction.c	2008-05-16 11:03:26.000000000 -0700
@@ -1714,6 +1714,19 @@ int journal_try_to_free_buffers(journal_
 			goto busy;
 	} while ((bh = bh->b_this_page) != head);
 	ret = try_to_free_buffers(page);
+	if (ret == 0) {
+		/*
+		 * journal_submit_data_buffers() may still hold the bh
+		 * reference even after clearing the jh with
+		 * journal_remove_journal_head(), which makes
+		 * try_to_free_buffers() fail.  Take j_list_lock to wait
+		 * for journal_submit_data_buffers() to finish removing
+		 * the bh from the sync data list, then retry.
+		 */
+		spin_lock(&journal->j_list_lock);
+		ret = try_to_free_buffers(page);
+		spin_unlock(&journal->j_list_lock);
+	}
 busy:
 	return ret;
 }