ext4-handle-page-writhout-buffers-in-da-writepage.patch

   1 ext4: Handle page without buffers in ext4_*_writepage()
   2
   3 From: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
   4
   5 It can happen that buffers are removed from the page before it gets
   6 marked dirty and is then passed to writepage().  In writepage() we just
   7 initialize the buffers and check whether they are all mapped and
   8 non-delay.  If they are, we write the page; otherwise we redirty the
   9 page.  With this change we don't do block allocation at all in
  10 ext4_*_writepage().
  11
  12 writepage() can get called under many conditions, and with a locking order
  13 of journal_start -> lock_page we should not try to allocate blocks in
  14 writepage(), which gets called after taking the page lock.  writepage() can
  15 get called via shrink_page_list even with a journal handle which was
  16 created for doing an inode update.  For example, when doing
  17 ext4_da_write_begin we create a journal handle with credit 1, expecting an
  18 i_disksize update for the inode.  But ext4_da_write_begin can cause
  19 shrink_page_list via _grab_page_cache.  So having a valid handle via
  20 ext4_journal_current_handle is not a guarantee that we can use the
  21 handle for block allocation in writepage, since we shouldn't be using
  22 credits that have been reserved for other updates.  Doing so could result
  23 in us running out of credits when we update inodes.
  24
  25 Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
  26 Signed-off-by: Mingming Cao <cmm@us.ibm.com>
  27 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
  28 ---
  29  fs/ext4/inode.c |  169 +++++++++++++++++++++++++++++++++++++++++---------------
  30  1 file changed, 124 insertions(+), 45 deletions(-)
  31
  32 Index: linux-2.6.26-rc8/fs/ext4/inode.c
  33 ===================================================================
  34 --- linux-2.6.26-rc8.orig/fs/ext4/inode.c       2008-06-30 11:49:49.000000000 -0700
  35 +++ linux-2.6.26-rc8/fs/ext4/inode.c    2008-06-30 11:49:56.000000000 -0700
  36 @@ -1592,11 +1592,15 @@ static int ext4_da_get_block_write(struc
  37         handle_t *handle = NULL;
  38
  39         handle = ext4_journal_current_handle();
  40 -       BUG_ON(handle == NULL);
  41 -       BUG_ON(create == 0);
  42 -
  43 -       ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
  44 +       if (!handle) {
  45 +               ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
  46 +                                  bh_result, 0, 0, 0);
  47 +               BUG_ON(!ret);
  48 +       } else {
  49 +               ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
  50                                    bh_result, create, 0, EXT4_DELALLOC_RSVED);
  51 +       }
  52 +
  53         if (ret > 0) {
  54                 bh_result->b_size = (ret << inode->i_blkbits);
  55
  56 @@ -1634,15 +1638,37 @@ static int ext4_da_get_block_write(struc
  57
  58  static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
  59  {
  60 -       return !buffer_mapped(bh) || buffer_delay(bh);
  61 +       /*
  62 +        * unmapped buffer is possible for holes.
  63 +        * delay buffer is possible with delayed allocation
  64 +        */
  65 +       return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
  66 +}
  67 +
  68 +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
  69 +                                  struct buffer_head *bh_result, int create)
  70 +{
  71 +       int ret = 0;
  72 +       unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
  73 +
  74 +       /*
  75 +        * we don't want to do block allocation in writepage
  76 +        * so call get_block_wrap with create = 0
  77 +        */
  78 +       ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
  79 +                                  bh_result, 0, 0, 0);
  80 +       if (ret > 0) {
  81 +               bh_result->b_size = (ret << inode->i_blkbits);
  82 +               ret = 0;
  83 +       }
  84 +       return ret;
  85  }
  86
  87  /*
  88 - * get called vi ext4_da_writepages after taking page lock
  89 - * We may end up doing block allocation here in case
  90 - * mpage_da_map_blocks failed to allocate blocks.
  91 - *
  92 - * We also get called via journal_submit_inode_data_buffers
  93 + * get called via ext4_da_writepages after taking page lock (have journal handle)
  94 + * get called via journal_submit_inode_data_buffers (no journal handle)
  95 + * get called via shrink_page_list via pdflush (no journal handle)
  96 + * or grab_page_cache when doing write_begin (have journal handle)
  97   */
  98  static int ext4_da_writepage(struct page *page,
  99                                 struct writeback_control *wbc)
 100 @@ -1650,37 +1676,61 @@ static int ext4_da_writepage(struct page
 101         int ret = 0;
 102         loff_t size;
 103         unsigned long len;
 104 -       handle_t *handle = NULL;
 105         struct buffer_head *page_bufs;
 106         struct inode *inode = page->mapping->host;
 107
 108 -       handle = ext4_journal_current_handle();
 109 -       if (!handle) {
 110 -               /*
 111 -                * This can happen when we aren't called via
 112 -                * ext4_da_writepages() but directly (shrink_page_list).
 113 -                * We cannot easily start a transaction here so we just skip
 114 -                * writing the page in case we would have to do so.
 115 -                * We reach here also via journal_submit_inode_data_buffers
 116 -                */
 117 -               size = i_size_read(inode);
 118 +       size = i_size_read(inode);
 119 +       if (page->index == size >> PAGE_CACHE_SHIFT)
 120 +               len = size & ~PAGE_CACHE_MASK;
 121 +       else
 122 +               len = PAGE_CACHE_SIZE;
 123
 124 +       if (page_has_buffers(page)) {
 125                 page_bufs = page_buffers(page);
 126 -               if (page->index == size >> PAGE_CACHE_SHIFT)
 127 -                       len = size & ~PAGE_CACHE_MASK;
 128 -               else
 129 -                       len = PAGE_CACHE_SIZE;
 130 -
 131 -               if (walk_page_buffers(NULL, page_bufs, 0,
 132 -                               len, NULL, ext4_bh_unmapped_or_delay)) {
 133 +               if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 134 +                                       ext4_bh_unmapped_or_delay)) {
 135                         /*
 136 -                        * We can't do block allocation under
 137 -                        * page lock without a handle . So redirty
 138 -                        * the page and return
 139 +                        * We don't want to do  block allocation
 140 +                        * So redirty the page and return
 141                          * We may reach here when we do a journal commit
 142                          * via journal_submit_inode_data_buffers.
 143                          * If we don't have mapping block we just ignore
 144 -                        * them
 145 +                        * them. We can also reach here via shrink_page_list
 146 +                        */
 147 +                       redirty_page_for_writepage(wbc, page);
 148 +                       unlock_page(page);
 149 +                       return 0;
 150 +               }
 151 +       } else {
 152 +               /*
 153 +                * The test for page_has_buffers() is subtle:
 154 +                * We know the page is dirty but it lost buffers. That means
 155 +                * that at some moment in time after write_begin()/write_end()
 156 +                * has been called all buffers have been clean and thus they
 157 +                * must have been written at least once. So they are all
 158 +                * mapped and we can happily proceed with mapping them
 159 +                * and writing the page.
 160 +                *
 161 +                * Try to initialize the buffer_heads and check whether
 162 +                * all are mapped and non delay. We don't want to
 163 +                * do block allocation here.
 164 +                */
 165 +               ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
 166 +                                               ext4_normal_get_block_write);
 167 +               if (!ret) {
 168 +                       page_bufs = page_buffers(page);
 169 +                       /* check whether all are mapped and non delay */
 170 +                       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
 171 +                                               ext4_bh_unmapped_or_delay)) {
 172 +                               redirty_page_for_writepage(wbc, page);
 173 +                               unlock_page(page);
 174 +                               return 0;
 175 +                       }
 176 +               } else {
 177 +                       /*
 178 +                        * We can't do block allocation here
 179 +                        * so just redirty the page and unlock
 180 +                        * and return
 181                          */
 182                         redirty_page_for_writepage(wbc, page);
 183                         unlock_page(page);
 184 @@ -1689,9 +1739,11 @@ static int ext4_da_writepage(struct page
 185         }
 186
 187         if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 188 -               ret = nobh_writepage(page, ext4_da_get_block_write, wbc);
 189 +               ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
 190         else
 191 -               ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
 192 +               ret = block_write_full_page(page,
 193 +                                               ext4_normal_get_block_write,
 194 +                                               wbc);
 195
 196         return ret;
 197  }
 198 @@ -2032,12 +2084,14 @@ static int __ext4_normal_writepage(struc
 199         struct inode *inode = page->mapping->host;
 200
 201         if (test_opt(inode->i_sb, NOBH))
 202 -               return nobh_writepage(page, ext4_get_block, wbc);
 203 +               return nobh_writepage(page,
 204 +                                       ext4_normal_get_block_write, wbc);
 205         else
 206 -               return block_write_full_page(page, ext4_get_block, wbc);
 207 +               return block_write_full_page(page,
 208 +                                               ext4_normal_get_block_write,
 209 +                                               wbc);
 210  }
 211
 212 -
 213  static int ext4_normal_writepage(struct page *page,
 214                                 struct writeback_control *wbc)
 215  {
 216 @@ -2046,13 +2100,24 @@ static int ext4_normal_writepage(struct
 217         loff_t len;
 218
 219         J_ASSERT(PageLocked(page));
 220 -       J_ASSERT(page_has_buffers(page));
 221         if (page->index == size >> PAGE_CACHE_SHIFT)
 222                 len = size & ~PAGE_CACHE_MASK;
 223         else
 224                 len = PAGE_CACHE_SIZE;
 225 -       BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
 226 -                                ext4_bh_unmapped_or_delay));
 227 +
 228 +       if (page_has_buffers(page)) {
 229 +               /* if page has buffers it should all be mapped
 230 +                * and allocated. If there are not buffers attached
 231 +                * to the page we know the page is dirty but it lost
 232 +                * buffers. That means that at some moment in time
 233 +                * after write_begin() / write_end() has been called
 234 +                * all buffers have been clean and thus they must have been
 235 +                * written at least once. So they are all mapped and we can
 236 +                * happily proceed with mapping them and writing the page.
 237 +                */
 238 +               BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
 239 +                                       ext4_bh_unmapped_or_delay));
 240 +       }
 241
 242         if (!ext4_journal_current_handle())
 243                 return __ext4_normal_writepage(page, wbc);
 244 @@ -2072,7 +2137,8 @@ static int __ext4_journalled_writepage(s
 245         int ret = 0;
 246         int err;
 247
 248 -       ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
 249 +       ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
 250 +                                       ext4_normal_get_block_write);
 251         if (ret != 0)
 252                 goto out_unlock;
 253
 254 @@ -2119,13 +2185,24 @@ static int ext4_journalled_writepage(str
 255         loff_t len;
 256
 257         J_ASSERT(PageLocked(page));
 258 -       J_ASSERT(page_has_buffers(page));
 259         if (page->index == size >> PAGE_CACHE_SHIFT)
 260                 len = size & ~PAGE_CACHE_MASK;
 261         else
 262                 len = PAGE_CACHE_SIZE;
 263 -       BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
 264 -                                ext4_bh_unmapped_or_delay));
 265 +
 266 +       if (page_has_buffers(page)) {
 267 +               /* if page has buffers it should all be mapped
 268 +                * and allocated. If there are not buffers attached
 269 +                * to the page we know the page is dirty but it lost
 270 +                * buffers. That means that at some moment in time
 271 +                * after write_begin() / write_end() has been called
 272 +                * all buffers have been clean and thus they must have been
 273 +                * written at least once. So they are all mapped and we can
 274 +                * happily proceed with mapping them and writing the page.
 275 +                */
 276 +               BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
 277 +                                       ext4_bh_unmapped_or_delay));
 278 +       }
 279
 280         if (ext4_journal_current_handle())
 281                 goto no_write;
 282 @@ -2143,7 +2220,9 @@ static int ext4_journalled_writepage(str
 283                  * really know unless we go poke around in the buffer_heads.
 284                  * But block_write_full_page will do the right thing.
 285                  */
 286 -               return block_write_full_page(page, ext4_get_block, wbc);
 287 +               return block_write_full_page(page,
 288 +                                               ext4_normal_get_block_write,
 289 +                                               wbc);
 290         }
 291  no_write:
 292         redirty_page_for_writepage(wbc, page);