jbd2: Fix jbd2_journal_commit_transaction()/jbd2_journal_try_to_free_buffers() race

From: Mingming Cao <cmm@us.ibm.com>

There are a few cases where direct I/O could race with kjournald
flushing data buffers, which can result in direct I/O returning an
EIO error (a condensed sketch of the racing paths follows the list):

1) journal_submit_data_buffers() tries to get the bh_state lock. If
the trylock fails, it drops j_list_lock and sleeps waiting for the
bh_state lock, while still holding a reference on the buffer. In the
meantime, journal_try_to_free_buffers() can clean up the journal head
and call try_to_free_buffers(). try_to_free_buffers() then fails
because of the reference held by journal_submit_data_buffers(), which
in turn makes invalidate_inode_pages2(), and hence direct I/O, fail.

2) When the buffer is on t_locked_list waiting for I/O to finish, we
hold a reference and give up the CPU if we cannot get the bh_state
lock. This causes try_to_free_buffers() to fail.

3) When journal_submit_data_buffers() sees that a buffer (bh1) is
dirty but fails to lock it, it releases j_list_lock and submits the
other buffers collected so far, with the reference to bh1 still held.
During this time journal_try_to_free_buffers() could clean up the
journal head of bh1 and remove it from the t_syncdata_list. Then
try_to_free_buffers() fails because of the reference held by
journal_submit_data_buffers().

4) journal_submit_data_buffers() has already removed the jh from the
bh (having found that the buffer is already being synced), but still
holds a reference to the buffer head. If journal_try_to_free_buffers()
is called at this point, try_to_free_buffers() runs (since there is no
jh attached to the buffer) and fails because
journal_submit_data_buffers() has not finished its cleanup yet.
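
To make the window concrete, here is an illustrative, simplified
sketch of the two racing paths as they look before this patch
(condensed from the code the hunks below touch; locking details and
error handling are omitted, so this is not the exact 2.6.26 source):

	/* commit side: data flush loop in fs/jbd2/commit.c, simplified */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		get_bh(bh);			/* reference held ...    */
		if (buffer_dirty(bh) && test_set_buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			lock_buffer(bh);	/* ... across this sleep */
			spin_lock(&journal->j_list_lock);
		}
		/* ... submit/cleanup, eventually put_bh(bh) ... */
	}

	/*
	 * DIO side, racing with the above: invalidate_inode_pages2() ends
	 * up in jbd2_journal_try_to_free_buffers(), whose call to
	 * try_to_free_buffers(page) fails while the commit side still
	 * holds its bh reference (elevated b_count); the failure is
	 * propagated back to direct I/O as EIO.
	 */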

The fix for the first three races is to drop the reference on the
buffer head when releasing j_list_lock, give up the CPU and retry the
whole operation.

This patch also fixes the race where the data buffers have already
been flushed to disk and the journal head cleared by
journal_submit_data_buffers(), but it has not yet had a chance to
release the buffer head reference before journal_try_to_free_buffers()
kicks in.
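
Condensed from the hunks below (illustrative only; see the diff for
the exact context), the commit-side pattern after the fix becomes:

	if (!inverted_lock(journal, bh, locked)) {
		/*
		 * inverted_lock() now unlocks the buffer if we had
		 * locked it, drops our bh reference and schedule()s
		 * before returning 0, so nothing pins the buffer while
		 * this thread is off the CPU.
		 */
		spin_lock(&journal->j_list_lock);
		continue;	/* retry the whole operation */
	}

On the other side, jbd2_journal_try_to_free_buffers() retries
try_to_free_buffers() once more under j_list_lock, which serializes
against the commit code dropping its last buffer reference and
removing the bh from the sync data list.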

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Mingming Cao <mcao@us.ibm.com>
---
 fs/jbd2/commit.c      |   23 ++++++++++++++++-------
 fs/jbd2/transaction.c |   13 +++++++++++++
 2 files changed, 29 insertions(+), 7 deletions(-)

Index: linux-2.6.26-rc2/fs/jbd2/commit.c
===================================================================
--- linux-2.6.26-rc2.orig/fs/jbd2/commit.c	2008-05-16 11:03:25.000000000 -0700
+++ linux-2.6.26-rc2/fs/jbd2/commit.c	2008-05-16 11:03:27.000000000 -0700
@@ -81,12 +81,16 @@ nope:
 
 /*
  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
+ * held. For ranking reasons we must trylock. If we lose, unlock the buffer
+ * if needed, drop the reference on the buffer, schedule away and
  * return 0. j_list_lock is dropped in this case.
  */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+static int inverted_lock(journal_t *journal, struct buffer_head *bh, int locked)
 {
 	if (!jbd_trylock_bh_state(bh)) {
+		if (locked)
+			unlock_buffer(bh);
+		put_bh(bh);
 		spin_unlock(&journal->j_list_lock);
 		schedule();
 		return 0;
@@ -220,7 +224,7 @@ static int journal_wait_on_locked_list(j
 				ret = -EIO;
 			spin_lock(&journal->j_list_lock);
 		}
-		if (!inverted_lock(journal, bh)) {
+		if (!inverted_lock(journal, bh, 0)) {
 			put_bh(bh);
 			spin_lock(&journal->j_list_lock);
 			continue;
@@ -290,19 +294,24 @@ write_out_data:
 		if (buffer_dirty(bh)) {
 			if (test_set_buffer_locked(bh)) {
 				BUFFER_TRACE(bh, "needs blocking lock");
+				put_bh(bh);
 				spin_unlock(&journal->j_list_lock);
 				/* Write out all data to prevent deadlocks */
 				journal_do_submit_data(wbuf, bufs);
 				bufs = 0;
-				lock_buffer(bh);
 				spin_lock(&journal->j_list_lock);
+				continue;
 			}
 			locked = 1;
 		}
-		/* We have to get bh_state lock. Again out of order, sigh. */
-		if (!inverted_lock(journal, bh)) {
-			jbd_lock_bh_state(bh);
+		/*
+		 * We have to get bh_state lock. If the try lock fails,
+		 * release the ref on the buffer, give up cpu and retry the
+		 * whole operation.
+		 */
+		if (!inverted_lock(journal, bh, locked)) {
 			spin_lock(&journal->j_list_lock);
+			continue;
 		}
 		/* Someone already cleaned up the buffer? */
 		if (!buffer_jbd(bh)

Index: linux-2.6.26-rc2/fs/jbd2/transaction.c
===================================================================
--- linux-2.6.26-rc2.orig/fs/jbd2/transaction.c	2008-05-16 11:03:20.000000000 -0700
+++ linux-2.6.26-rc2/fs/jbd2/transaction.c	2008-05-16 11:03:27.000000000 -0700
@@ -1722,6 +1722,19 @@ int jbd2_journal_try_to_free_buffers(jou
 			goto busy;
 	} while ((bh = bh->b_this_page) != head);
 	ret = try_to_free_buffers(page);
+	if (ret == 0) {
+		/*
+		 * It is possible that journal_submit_data_buffers()
+		 * still holds a bh reference even after it has cleared
+		 * the jh via journal_remove_journal_head(), which makes
+		 * try_to_free_buffers() fail. Retry under j_list_lock
+		 * to wait for journal_submit_data_buffers() to finish
+		 * removing the bh from the sync data list.
+		 */
+		spin_lock(&journal->j_list_lock);
+		ret = try_to_free_buffers(page);
+		spin_unlock(&journal->j_list_lock);
+	}
 busy:
 	return ret;