ext4: restructure writeback path

From: Jan Kara <jack@suse.cz>
There are two issues with the current writeback path in ext4.  First,
when blocksize < pagesize we don't necessarily map complete pages, so a
single iteration may end up doing no writeback at all.  We always map
some blocks, so we will eventually finish mapping the page, but if
writeback races with other operations on the file, forward progress is
not really guaranteed.  Second, the current code structure makes it
hard to associate all the bios for some range of pages with one io_end
structure, so that unwritten extents can be converted after all the
bios have finished.  This will be especially difficult later, when the
io_end will be associated with a reserved transaction handle.
We restructure the writeback path into a relatively simple loop which
first prepares an extent of pages, then maps one or more extents so
that no page is left partially mapped, and once a page is fully mapped
it is submitted for IO.  We keep all the mapping and IO submission
information in the mpage_da_data structure to somewhat reduce stack
usage.  The resulting code is somewhat shorter than the old one and
hopefully also easier to read.
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h              |   15 -
 fs/ext4/inode.c             | 1011 ++++++++++++++++++++++++++++++++----------------------------------
 fs/ext4/page-io.c           |    4 -
 include/trace/events/ext4.h |   64 +++--
 4 files changed, 527 insertions(+), 567 deletions(-)
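
Note: the heart of the restructured path is the new writeback loop in
ext4_da_writepages() in the diff below.  Stripped of error handling,
the ENOSPC retry, and the range_cyclic handling, one iteration of that
loop looks roughly like this (a simplified, illustrative sketch only;
see the full function in the diff for the exact code):

	while (!done && mpd.first_page <= mpd.last_page) {
		/* Each extent of pages gets its own io_end */
		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
		/* Start a transaction for one extent worth of allocation */
		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
					    needed_blocks);
		/* Find & lock dirty pages, accumulate extent needing mapping */
		ret = mpage_prepare_extent_to_map(&mpd);
		if (!ret) {
			if (mpd.map.m_len)
				/* Allocate blocks, update buffers, submit
				 * fully mapped pages for IO */
				ret = mpage_map_and_submit_extent(handle, &mpd);
			else
				/* Nothing left that needs mapping */
				done = true;
		}
		ext4_journal_stop(handle);
		/* Submit prepared bio */
		ext4_io_submit(&mpd.io_submit);
		/* Unlock pages we didn't use */
		mpage_release_unused_pages(&mpd, false);
		/* Drop io_end reference we got from init */
		ext4_put_io_end(mpd.io_submit.io_end);
	}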
35 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
36 index 2ebfcde..90a164f 100644
37 --- a/fs/ext4/ext4.h
38 +++ b/fs/ext4/ext4.h
39 @@ -177,21 +177,6 @@ struct ext4_map_blocks {
40  };
42  /*
43 - * For delayed allocation tracking
44 - */
45 -struct mpage_da_data {
46 -       struct inode *inode;
47 -       sector_t b_blocknr;             /* start block number of extent */
48 -       size_t b_size;                  /* size of extent */
49 -       unsigned long b_state;          /* state of the extent */
50 -       unsigned long first_page, next_page;    /* extent of pages */
51 -       struct writeback_control *wbc;
52 -       int io_done;
53 -       int pages_written;
54 -       int retval;
55 -};
57 -/*
58   * Flags for ext4_io_end->flags
59   */
60  #define        EXT4_IO_END_UNWRITTEN   0x0001
61 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
62 index 2b777e5..5939a47 100644
63 --- a/fs/ext4/inode.c
64 +++ b/fs/ext4/inode.c
65 @@ -1407,149 +1407,42 @@ static void ext4_da_page_release_reservation(struct page *page,
66   * Delayed allocation stuff
67   */
69 -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd);
71 -/*
72 - * mpage_da_submit_io - walks through extent of pages and try to write
73 - * them with writepage() call back
74 - *
75 - * @mpd->inode: inode
76 - * @mpd->first_page: first page of the extent
77 - * @mpd->next_page: page after the last page of the extent
78 - *
79 - * By the time mpage_da_submit_io() is called we expect all blocks
80 - * to be allocated. this may be wrong if allocation failed.
81 - *
82 - * As pages are already locked by write_cache_pages(), we can't use it
83 - */
84 -static int mpage_da_submit_io(struct mpage_da_data *mpd,
85 -                             struct ext4_map_blocks *map)
87 -       struct pagevec pvec;
88 -       unsigned long index, end;
89 -       int ret = 0, err, nr_pages, i;
90 -       struct inode *inode = mpd->inode;
91 -       struct address_space *mapping = inode->i_mapping;
92 -       loff_t size = i_size_read(inode);
93 -       unsigned int len, block_start;
94 -       struct buffer_head *bh, *page_bufs = NULL;
95 -       sector_t pblock = 0, cur_logical = 0;
96 -       struct ext4_io_submit io_submit;
98 -       BUG_ON(mpd->next_page <= mpd->first_page);
99 -       ext4_io_submit_init(&io_submit, mpd->wbc);
100 -       io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
101 -       if (!io_submit.io_end) {
102 -               ext4_da_block_invalidatepages(mpd);
103 -               return -ENOMEM;
104 -       }
105 +struct mpage_da_data {
106 +       struct inode *inode;
107 +       struct writeback_control *wbc;
108 +       pgoff_t first_page;     /* The first page to write */
109 +       pgoff_t next_page;      /* Current page to examine */
110 +       pgoff_t last_page;      /* Last page to examine */
111         /*
112 -        * We need to start from the first_page to the next_page - 1
113 -        * to make sure we also write the mapped dirty buffer_heads.
114 -        * If we look at mpd->b_blocknr we would only be looking
115 -        * at the currently mapped buffer_heads.
116 +        * Extent to map - this can be after first_page because that can be
117 +        * fully mapped. We somewhat abuse m_flags to store whether the extent
118 +        * is delalloc or unwritten.
119          */
120 -       index = mpd->first_page;
121 -       end = mpd->next_page - 1;
123 -       pagevec_init(&pvec, 0);
124 -       while (index <= end) {
125 -               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
126 -               if (nr_pages == 0)
127 -                       break;
128 -               for (i = 0; i < nr_pages; i++) {
129 -                       int skip_page = 0;
130 -                       struct page *page = pvec.pages[i];
132 -                       index = page->index;
133 -                       if (index > end)
134 -                               break;
136 -                       if (index == size >> PAGE_CACHE_SHIFT)
137 -                               len = size & ~PAGE_CACHE_MASK;
138 -                       else
139 -                               len = PAGE_CACHE_SIZE;
140 -                       if (map) {
141 -                               cur_logical = index << (PAGE_CACHE_SHIFT -
142 -                                                       inode->i_blkbits);
143 -                               pblock = map->m_pblk + (cur_logical -
144 -                                                       map->m_lblk);
145 -                       }
146 -                       index++;
148 -                       BUG_ON(!PageLocked(page));
149 -                       BUG_ON(PageWriteback(page));
151 -                       bh = page_bufs = page_buffers(page);
152 -                       block_start = 0;
153 -                       do {
154 -                               if (map && (cur_logical >= map->m_lblk) &&
155 -                                   (cur_logical <= (map->m_lblk +
156 -                                                    (map->m_len - 1)))) {
157 -                                       if (buffer_delay(bh)) {
158 -                                               clear_buffer_delay(bh);
159 -                                               bh->b_blocknr = pblock;
160 -                                       }
161 -                                       if (buffer_unwritten(bh) ||
162 -                                           buffer_mapped(bh))
163 -                                               BUG_ON(bh->b_blocknr != pblock);
164 -                                       if (map->m_flags & EXT4_MAP_UNINIT)
165 -                                               set_buffer_uninit(bh);
166 -                                       clear_buffer_unwritten(bh);
167 -                               }
169 -                               /*
170 -                                * skip page if block allocation undone and
171 -                                * block is dirty
172 -                                */
173 -                               if (ext4_bh_delay_or_unwritten(NULL, bh))
174 -                                       skip_page = 1;
175 -                               bh = bh->b_this_page;
176 -                               block_start += bh->b_size;
177 -                               cur_logical++;
178 -                               pblock++;
179 -                       } while (bh != page_bufs);
181 -                       if (skip_page) {
182 -                               unlock_page(page);
183 -                               continue;
184 -                       }
186 -                       clear_page_dirty_for_io(page);
187 -                       err = ext4_bio_write_page(&io_submit, page, len,
188 -                                                 mpd->wbc);
189 -                       if (!err)
190 -                               mpd->pages_written++;
191 -                       /*
192 -                        * In error case, we have to continue because
193 -                        * remaining pages are still locked
194 -                        */
195 -                       if (ret == 0)
196 -                               ret = err;
197 -               }
198 -               pagevec_release(&pvec);
199 -       }
200 -       ext4_io_submit(&io_submit);
201 -       /* Drop io_end reference we got from init */
202 -       ext4_put_io_end_defer(io_submit.io_end);
203 -       return ret;
205 +       struct ext4_map_blocks map;
206 +       struct ext4_io_submit io_submit;        /* IO submission data */
209 -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
210 +static void mpage_release_unused_pages(struct mpage_da_data *mpd,
211 +                                      bool invalidate)
213         int nr_pages, i;
214         pgoff_t index, end;
215         struct pagevec pvec;
216         struct inode *inode = mpd->inode;
217         struct address_space *mapping = inode->i_mapping;
218 -       ext4_lblk_t start, last;
220 +       /* This is necessary when next_page == 0. */
221 +       if (mpd->first_page >= mpd->next_page)
222 +               return;
224         index = mpd->first_page;
225         end   = mpd->next_page - 1;
227 -       start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
228 -       last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
229 -       ext4_es_remove_extent(inode, start, last - start + 1);
230 +       if (invalidate) {
231 +               ext4_lblk_t start, last;
232 +               start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
233 +               last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
234 +               ext4_es_remove_extent(inode, start, last - start + 1);
235 +       }
237         pagevec_init(&pvec, 0);
238         while (index <= end) {
239 @@ -1562,14 +1455,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
240                                 break;
241                         BUG_ON(!PageLocked(page));
242                         BUG_ON(PageWriteback(page));
243 -                       block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
244 -                       ClearPageUptodate(page);
245 +                       if (invalidate) {
246 +                               block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
247 +                               ClearPageUptodate(page);
248 +                       }
249                         unlock_page(page);
250                 }
251                 index = pvec.pages[nr_pages - 1]->index + 1;
252                 pagevec_release(&pvec);
253         }
254 -       return;
257  static void ext4_print_free_blocks(struct inode *inode)
258 @@ -1598,215 +1492,6 @@ static void ext4_print_free_blocks(struct inode *inode)
259         return;
263 - * mpage_da_map_and_submit - go through given space, map them
264 - *       if necessary, and then submit them for I/O
265 - *
266 - * @mpd - bh describing space
267 - *
268 - * The function skips space we know is already mapped to disk blocks.
269 - *
270 - */
271 -static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
273 -       int err, blks, get_blocks_flags;
274 -       struct ext4_map_blocks map, *mapp = NULL;
275 -       sector_t next = mpd->b_blocknr;
276 -       unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
277 -       loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
278 -       handle_t *handle = NULL;
280 -       /*
281 -        * If the blocks are mapped already, or we couldn't accumulate
282 -        * any blocks, then proceed immediately to the submission stage.
283 -        */
284 -       if ((mpd->b_size == 0) ||
285 -           ((mpd->b_state  & (1 << BH_Mapped)) &&
286 -            !(mpd->b_state & (1 << BH_Delay)) &&
287 -            !(mpd->b_state & (1 << BH_Unwritten))))
288 -               goto submit_io;
290 -       handle = ext4_journal_current_handle();
291 -       BUG_ON(!handle);
293 -       /*
294 -        * Call ext4_map_blocks() to allocate any delayed allocation
295 -        * blocks, or to convert an uninitialized extent to be
296 -        * initialized (in the case where we have written into
297 -        * one or more preallocated blocks).
298 -        *
299 -        * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
300 -        * indicate that we are on the delayed allocation path.  This
301 -        * affects functions in many different parts of the allocation
302 -        * call path.  This flag exists primarily because we don't
303 -        * want to change *many* call functions, so ext4_map_blocks()
304 -        * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
305 -        * inode's allocation semaphore is taken.
306 -        *
307 -        * If the blocks in questions were delalloc blocks, set
308 -        * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
309 -        * variables are updated after the blocks have been allocated.
310 -        */
311 -       map.m_lblk = next;
312 -       map.m_len = max_blocks;
313 -       /*
314 -        * We're in delalloc path and it is possible that we're going to
315 -        * need more metadata blocks than previously reserved. However
316 -        * we must not fail because we're in writeback and there is
317 -        * nothing we can do about it so it might result in data loss.
318 -        * So use reserved blocks to allocate metadata if possible.
319 -        */
320 -       get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
321 -                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
322 -       if (ext4_should_dioread_nolock(mpd->inode))
323 -               get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
324 -       if (mpd->b_state & (1 << BH_Delay))
325 -               get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
328 -       blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
329 -       if (blks < 0) {
330 -               struct super_block *sb = mpd->inode->i_sb;
332 -               err = blks;
333 -               /*
334 -                * If get block returns EAGAIN or ENOSPC and there
335 -                * appears to be free blocks we will just let
336 -                * mpage_da_submit_io() unlock all of the pages.
337 -                */
338 -               if (err == -EAGAIN)
339 -                       goto submit_io;
341 -               if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
342 -                       mpd->retval = err;
343 -                       goto submit_io;
344 -               }
346 -               /*
347 -                * get block failure will cause us to loop in
348 -                * writepages, because a_ops->writepage won't be able
349 -                * to make progress. The page will be redirtied by
350 -                * writepage and writepages will again try to write
351 -                * the same.
352 -                */
353 -               if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
354 -                       ext4_msg(sb, KERN_CRIT,
355 -                                "delayed block allocation failed for inode %lu "
356 -                                "at logical offset %llu with max blocks %zd "
357 -                                "with error %d", mpd->inode->i_ino,
358 -                                (unsigned long long) next,
359 -                                mpd->b_size >> mpd->inode->i_blkbits, err);
360 -                       ext4_msg(sb, KERN_CRIT,
361 -                               "This should not happen!! Data will be lost");
362 -                       if (err == -ENOSPC)
363 -                               ext4_print_free_blocks(mpd->inode);
364 -               }
365 -               /* invalidate all the pages */
366 -               ext4_da_block_invalidatepages(mpd);
368 -               /* Mark this page range as having been completed */
369 -               mpd->io_done = 1;
370 -               return;
371 -       }
372 -       BUG_ON(blks == 0);
374 -       mapp = &map;
375 -       if (map.m_flags & EXT4_MAP_NEW) {
376 -               struct block_device *bdev = mpd->inode->i_sb->s_bdev;
377 -               int i;
379 -               for (i = 0; i < map.m_len; i++)
380 -                       unmap_underlying_metadata(bdev, map.m_pblk + i);
381 -       }
383 -       /*
384 -        * Update on-disk size along with block allocation.
385 -        */
386 -       disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
387 -       if (disksize > i_size_read(mpd->inode))
388 -               disksize = i_size_read(mpd->inode);
389 -       if (disksize > EXT4_I(mpd->inode)->i_disksize) {
390 -               ext4_update_i_disksize(mpd->inode, disksize);
391 -               err = ext4_mark_inode_dirty(handle, mpd->inode);
392 -               if (err)
393 -                       ext4_error(mpd->inode->i_sb,
394 -                                  "Failed to mark inode %lu dirty",
395 -                                  mpd->inode->i_ino);
396 -       }
398 -submit_io:
399 -       mpage_da_submit_io(mpd, mapp);
400 -       mpd->io_done = 1;
403 -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
404 -               (1 << BH_Delay) | (1 << BH_Unwritten))
407 - * mpage_add_bh_to_extent - try to add one more block to extent of blocks
408 - *
409 - * @mpd->lbh - extent of blocks
410 - * @logical - logical number of the block in the file
411 - * @b_state - b_state of the buffer head added
412 - *
413 - * the function is used to collect contig. blocks in same state
414 - */
415 -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
416 -                                  unsigned long b_state)
418 -       sector_t next;
419 -       int blkbits = mpd->inode->i_blkbits;
420 -       int nrblocks = mpd->b_size >> blkbits;
422 -       /*
423 -        * XXX Don't go larger than mballoc is willing to allocate
424 -        * This is a stopgap solution.  We eventually need to fold
425 -        * mpage_da_submit_io() into this function and then call
426 -        * ext4_map_blocks() multiple times in a loop
427 -        */
428 -       if (nrblocks >= (8*1024*1024 >> blkbits))
429 -               goto flush_it;
431 -       /* check if the reserved journal credits might overflow */
432 -       if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
433 -               if (nrblocks >= EXT4_MAX_TRANS_DATA) {
434 -                       /*
435 -                        * With non-extent format we are limited by the journal
436 -                        * credit available.  Total credit needed to insert
437 -                        * nrblocks contiguous blocks is dependent on the
438 -                        * nrblocks.  So limit nrblocks.
439 -                        */
440 -                       goto flush_it;
441 -               }
442 -       }
443 -       /*
444 -        * First block in the extent
445 -        */
446 -       if (mpd->b_size == 0) {
447 -               mpd->b_blocknr = logical;
448 -               mpd->b_size = 1 << blkbits;
449 -               mpd->b_state = b_state & BH_FLAGS;
450 -               return;
451 -       }
453 -       next = mpd->b_blocknr + nrblocks;
454 -       /*
455 -        * Can we merge the block to our big extent?
456 -        */
457 -       if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
458 -               mpd->b_size += 1 << blkbits;
459 -               return;
460 -       }
462 -flush_it:
463 -       /*
464 -        * We couldn't merge the block to our extent, so we
465 -        * need to flush current  extent and start new one
466 -        */
467 -       mpage_da_map_and_submit(mpd);
468 -       return;
471  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
473         return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
474 @@ -2204,6 +1889,8 @@ static int ext4_writepage(struct page *page,
475         return ret;
478 +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
480  /*
481   * mballoc gives us at most this number of blocks...
482   * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
483 @@ -2212,6 +1899,315 @@ static int ext4_writepage(struct page *page,
484  #define MAX_WRITEPAGES_EXTENT_LEN 2048
486  /*
487 + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
488 + *
489 + * @mpd - extent of blocks
490 + * @lblk - logical number of the block in the file
491 + * @b_state - b_state of the buffer head added
492 + *
493 + * the function is used to collect contig. blocks in same state
494 + */
495 +static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
496 +                                 unsigned long b_state)
498 +       struct ext4_map_blocks *map = &mpd->map;
500 +       /* Don't go larger than mballoc is willing to allocate */
501 +       if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
502 +               return 0;
504 +       /* First block in the extent? */
505 +       if (map->m_len == 0) {
506 +               map->m_lblk = lblk;
507 +               map->m_len = 1;
508 +               map->m_flags = b_state & BH_FLAGS;
509 +               return 1;
510 +       }
512 +       /* Can we merge the block to our big extent? */
513 +       if (lblk == map->m_lblk + map->m_len &&
514 +           (b_state & BH_FLAGS) == map->m_flags) {
515 +               map->m_len++;
516 +               return 1;
517 +       }
518 +       return 0;
521 +static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
522 +                                   struct buffer_head *head,
523 +                                   struct buffer_head *bh,
524 +                                   ext4_lblk_t lblk)
526 +       struct inode *inode = mpd->inode;
527 +       ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
528 +                                                       >> inode->i_blkbits;
530 +       do {
531 +               BUG_ON(buffer_locked(bh));
533 +               if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
534 +                   (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
535 +                   lblk >= blocks) {
536 +                       /* Found extent to map? */
537 +                       if (mpd->map.m_len)
538 +                               return false;
539 +                       if (lblk >= blocks)
540 +                               return true;
541 +                       continue;
542 +               }
543 +               if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
544 +                       return false;
545 +       } while (lblk++, (bh = bh->b_this_page) != head);
546 +       return true;
549 +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
551 +       int len;
552 +       loff_t size = i_size_read(mpd->inode);
553 +       int err;
555 +       BUG_ON(page->index != mpd->first_page);
556 +       if (page->index == size >> PAGE_CACHE_SHIFT)
557 +               len = size & ~PAGE_CACHE_MASK;
558 +       else
559 +               len = PAGE_CACHE_SIZE;
560 +       clear_page_dirty_for_io(page);
561 +       err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
562 +       if (!err)
563 +               mpd->wbc->nr_to_write--;
564 +       mpd->first_page++;
566 +       return err;
570 + * mpage_map_buffers - update buffers corresponding to changed extent and
571 + *                    submit fully mapped pages for IO
572 + *
573 + * @mpd - description of extent to map, on return next extent to map
574 + *
575 + * Scan buffers corresponding to changed extent (we expect corresponding pages
576 + * to be already locked) and update buffer state according to new extent state.
577 + * We map delalloc buffers to their physical location, clear unwritten bits,
578 + * and mark buffers as uninit when we perform writes to uninitialized extents
579 + * and do extent conversion after IO is finished. If the last page is not fully
580 + * mapped, we update @map to the next extent in the last page that needs
581 + * mapping. Otherwise we submit the page for IO.
582 + */
583 +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
585 +       struct pagevec pvec;
586 +       int nr_pages, i;
587 +       struct inode *inode = mpd->inode;
588 +       struct buffer_head *head, *bh;
589 +       int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
590 +       ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
591 +                                                       >> inode->i_blkbits;
592 +       pgoff_t start, end;
593 +       ext4_lblk_t lblk;
594 +       sector_t pblock;
595 +       int err;
597 +       start = mpd->map.m_lblk >> bpp_bits;
598 +       end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
599 +       lblk = start << bpp_bits;
600 +       pblock = mpd->map.m_pblk;
602 +       pagevec_init(&pvec, 0);
603 +       while (start <= end) {
604 +               nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
605 +                                         PAGEVEC_SIZE);
606 +               if (nr_pages == 0)
607 +                       break;
608 +               for (i = 0; i < nr_pages; i++) {
609 +                       struct page *page = pvec.pages[i];
611 +                       if (page->index > end)
612 +                               break;
613 +                       /* Upto 'end' pages must be contiguous */
614 +                       BUG_ON(page->index != start);
615 +                       bh = head = page_buffers(page);
616 +                       do {
617 +                               if (lblk < mpd->map.m_lblk)
618 +                                       continue;
619 +                               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
620 +                                       /*
621 +                                        * Buffer after end of mapped extent.
622 +                                        * Find next buffer in the page to map.
623 +                                        */
624 +                                       mpd->map.m_len = 0;
625 +                                       mpd->map.m_flags = 0;
626 +                                       add_page_bufs_to_extent(mpd, head, bh,
627 +                                                               lblk);
628 +                                       pagevec_release(&pvec);
629 +                                       return 0;
630 +                               }
631 +                               if (buffer_delay(bh)) {
632 +                                       clear_buffer_delay(bh);
633 +                                       bh->b_blocknr = pblock++;
634 +                               }
635 +                               if (mpd->map.m_flags & EXT4_MAP_UNINIT)
636 +                                       set_buffer_uninit(bh);
637 +                               clear_buffer_unwritten(bh);
638 +                       } while (++lblk < blocks &&
639 +                                (bh = bh->b_this_page) != head);
641 +                       /*
642 +                        * FIXME: This is going to break if dioread_nolock
643 +                        * supports blocksize < pagesize as we will try to
644 +                        * convert potentially unmapped parts of inode.
645 +                        */
646 +                       mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
647 +                       /* Page fully mapped - let IO run! */
648 +                       err = mpage_submit_page(mpd, page);
649 +                       if (err < 0) {
650 +                               pagevec_release(&pvec);
651 +                               return err;
652 +                       }
653 +                       start++;
654 +               }
655 +               pagevec_release(&pvec);
656 +       }
657 +       /* Extent fully mapped and matches with page boundary. We are done. */
658 +       mpd->map.m_len = 0;
659 +       mpd->map.m_flags = 0;
660 +       return 0;
663 +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
665 +       struct inode *inode = mpd->inode;
666 +       struct ext4_map_blocks *map = &mpd->map;
667 +       int get_blocks_flags;
668 +       int err;
670 +       trace_ext4_da_write_pages_extent(inode, map);
671 +       /*
672 +        * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
673 +        * to convert an uninitialized extent to be initialized (in the case
674 +        * where we have written into one or more preallocated blocks).  It is
675 +        * possible that we're going to need more metadata blocks than
676 +        * previously reserved. However we must not fail because we're in
677 +        * writeback and there is nothing we can do about it so it might result
678 +        * in data loss.  So use reserved blocks to allocate metadata if
679 +        * possible.
680 +        *
681 +        * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
682 +        * in question are delalloc blocks.  This affects functions in many
683 +        * different parts of the allocation call path.  This flag exists
684 +        * primarily because we don't want to change *many* call functions, so
685 +        * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
686 +        * once the inode's allocation semaphore is taken.
687 +        */
688 +       get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
689 +                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
690 +       if (ext4_should_dioread_nolock(inode))
691 +               get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
692 +       if (map->m_flags & (1 << BH_Delay))
693 +               get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
695 +       err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
696 +       if (err < 0)
697 +               return err;
699 +       BUG_ON(map->m_len == 0);
700 +       if (map->m_flags & EXT4_MAP_NEW) {
701 +               struct block_device *bdev = inode->i_sb->s_bdev;
702 +               int i;
704 +               for (i = 0; i < map->m_len; i++)
705 +                       unmap_underlying_metadata(bdev, map->m_pblk + i);
706 +       }
707 +       return 0;
711 + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
712 + *                              mpd->len and submit pages underlying it for IO
713 + *
714 + * @handle - handle for journal operations
715 + * @mpd - extent to map
716 + *
717 + * The function maps extent starting at mpd->lblk of length mpd->len. If it is
718 + * delayed, blocks are allocated, if it is unwritten, we may need to convert
719 + * them to initialized or split the described range from larger unwritten
720 + * extent. Note that we need not map all the described range since allocation
721 + * can return less blocks or the range is covered by more unwritten extents. We
722 + * cannot map more because we are limited by reserved transaction credits. On
723 + * the other hand we always make sure that the last touched page is fully
724 + * mapped so that it can be written out (and thus forward progress is
725 + * guaranteed). After mapping we submit all mapped pages for IO.
726 + */
727 +static int mpage_map_and_submit_extent(handle_t *handle,
728 +                                      struct mpage_da_data *mpd)
730 +       struct inode *inode = mpd->inode;
731 +       struct ext4_map_blocks *map = &mpd->map;
732 +       int err;
733 +       loff_t disksize;
735 +       mpd->io_submit.io_end->offset =
736 +                               ((loff_t)map->m_lblk) << inode->i_blkbits;
737 +       while (map->m_len) {
738 +               err = mpage_map_one_extent(handle, mpd);
739 +               if (err < 0) {
740 +                       struct super_block *sb = inode->i_sb;
742 +                       /*
743 +                        * Need to commit transaction to free blocks. Let upper
744 +                        * layers sort it out.
745 +                        */
746 +                       if (err == -ENOSPC && ext4_count_free_clusters(sb))
747 +                               return -ENOSPC;
749 +                       if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
750 +                               ext4_msg(sb, KERN_CRIT,
751 +                                        "Delayed block allocation failed for "
752 +                                        "inode %lu at logical offset %llu with"
753 +                                        " max blocks %u with error %d",
754 +                                        inode->i_ino,
755 +                                        (unsigned long long)map->m_lblk,
756 +                                        (unsigned)map->m_len, err);
757 +                               ext4_msg(sb, KERN_CRIT,
758 +                                        "This should not happen!! Data will "
759 +                                        "be lost\n");
760 +                               if (err == -ENOSPC)
761 +                                       ext4_print_free_blocks(inode);
762 +                       }
763 +                       /* invalidate all the pages */
764 +                       mpage_release_unused_pages(mpd, true);
765 +                       return err;
766 +               }
767 +               /*
768 +                * Update buffer state, submit mapped pages, and get us new
769 +                * extent to map
770 +                */
771 +               err = mpage_map_and_submit_buffers(mpd);
772 +               if (err < 0)
773 +                       return err;
774 +       }
776 +       /* Update on-disk size after IO is submitted */
777 +       disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
778 +       if (disksize > i_size_read(inode))
779 +               disksize = i_size_read(inode);
780 +       if (disksize > EXT4_I(inode)->i_disksize) {
781 +               int err2;
783 +               ext4_update_i_disksize(inode, disksize);
784 +               err2 = ext4_mark_inode_dirty(handle, inode);
785 +               if (err2)
786 +                       ext4_error(inode->i_sb,
787 +                                  "Failed to mark inode %lu dirty",
788 +                                  inode->i_ino);
789 +               if (!err)
790 +                       err = err2;
791 +       }
792 +       return err;
796   * Calculate the total number of credits to reserve for one writepages
797   * iteration. This is called from ext4_da_writepages(). We map an extent of
798   * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
799 @@ -2227,44 +2223,49 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
802  /*
803 - * write_cache_pages_da - walk the list of dirty pages of the given
804 - * address space and accumulate pages that need writing, and call
805 - * mpage_da_map_and_submit to map a single contiguous memory region
806 - * and then write them.
807 + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
808 + *                              and underlying extent to map
809 + *
810 + * @mpd - where to look for pages
811 + *
812 + * Walk dirty pages in the mapping. If they are fully mapped, submit them for
813 + * IO immediately. When we find a page which isn't mapped we start accumulating
814 + * extent of buffers underlying these pages that needs mapping (formed by
815 + * either delayed or unwritten buffers). We also lock the pages containing
816 + * these buffers. The extent found is returned in @mpd structure (starting at
817 + * mpd->lblk with length mpd->len blocks).
818 + *
819 + * Note that this function can attach bios to one io_end structure which are
820 + * neither logically nor physically contiguous. Although it may seem as an
821 + * unnecessary complication, it is actually inevitable in blocksize < pagesize
822 + * case as we need to track IO to all buffers underlying a page in one io_end.
823   */
824 -static int write_cache_pages_da(handle_t *handle,
825 -                               struct address_space *mapping,
826 -                               struct writeback_control *wbc,
827 -                               struct mpage_da_data *mpd,
828 -                               pgoff_t *done_index)
829 +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
831 -       struct buffer_head      *bh, *head;
832 -       struct inode            *inode = mapping->host;
833 -       struct pagevec          pvec;
834 -       unsigned int            nr_pages;
835 -       sector_t                logical;
836 -       pgoff_t                 index, end;
837 -       long                    nr_to_write = wbc->nr_to_write;
838 -       int                     i, tag, ret = 0;
840 -       memset(mpd, 0, sizeof(struct mpage_da_data));
841 -       mpd->wbc = wbc;
842 -       mpd->inode = inode;
843 -       pagevec_init(&pvec, 0);
844 -       index = wbc->range_start >> PAGE_CACHE_SHIFT;
845 -       end = wbc->range_end >> PAGE_CACHE_SHIFT;
846 +       struct address_space *mapping = mpd->inode->i_mapping;
847 +       struct pagevec pvec;
848 +       unsigned int nr_pages;
849 +       pgoff_t index = mpd->first_page;
850 +       pgoff_t end = mpd->last_page;
851 +       int tag;
852 +       int i, err = 0;
853 +       int blkbits = mpd->inode->i_blkbits;
854 +       ext4_lblk_t lblk;
855 +       struct buffer_head *head;
857 -       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
858 +       if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
859                 tag = PAGECACHE_TAG_TOWRITE;
860         else
861                 tag = PAGECACHE_TAG_DIRTY;
863 -       *done_index = index;
864 +       pagevec_init(&pvec, 0);
865 +       mpd->map.m_len = 0;
866 +       mpd->next_page = index;
867         while (index <= end) {
868                 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
869                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
870                 if (nr_pages == 0)
871 -                       return 0;
872 +                       goto out;
874                 for (i = 0; i < nr_pages; i++) {
875                         struct page *page = pvec.pages[i];
876 @@ -2279,31 +2280,21 @@ static int write_cache_pages_da(handle_t *handle,
877                         if (page->index > end)
878                                 goto out;
880 -                       *done_index = page->index + 1;
882 -                       /*
883 -                        * If we can't merge this page, and we have
884 -                        * accumulated an contiguous region, write it
885 -                        */
886 -                       if ((mpd->next_page != page->index) &&
887 -                           (mpd->next_page != mpd->first_page)) {
888 -                               mpage_da_map_and_submit(mpd);
889 -                               goto ret_extent_tail;
890 -                       }
891 +                       /* If we can't merge this page, we are done. */
892 +                       if (mpd->map.m_len > 0 && mpd->next_page != page->index)
893 +                               goto out;
895                         lock_page(page);
897                         /*
898 -                        * If the page is no longer dirty, or its
899 -                        * mapping no longer corresponds to inode we
900 -                        * are writing (which means it has been
901 -                        * truncated or invalidated), or the page is
902 -                        * already under writeback and we are not
903 -                        * doing a data integrity writeback, skip the page
904 +                        * If the page is no longer dirty, or its mapping no
905 +                        * longer corresponds to inode we are writing (which
906 +                        * means it has been truncated or invalidated), or the
907 +                        * page is already under writeback and we are not doing
908 +                        * a data integrity writeback, skip the page
909                          */
910                         if (!PageDirty(page) ||
911                             (PageWriteback(page) &&
912 -                            (wbc->sync_mode == WB_SYNC_NONE)) ||
913 +                            (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
914                             unlikely(page->mapping != mapping)) {
915                                 unlock_page(page);
916                                 continue;
917 @@ -2312,101 +2303,57 @@ static int write_cache_pages_da(handle_t *handle,
918                         wait_on_page_writeback(page);
919                         BUG_ON(PageWriteback(page));
921 -                       /*
922 -                        * If we have inline data and arrive here, it means that
923 -                        * we will soon create the block for the 1st page, so
924 -                        * we'd better clear the inline data here.
925 -                        */
926 -                       if (ext4_has_inline_data(inode)) {
927 -                               BUG_ON(ext4_test_inode_state(inode,
928 -                                               EXT4_STATE_MAY_INLINE_DATA));
929 -                               ext4_destroy_inline_data(handle, inode);
930 -                       }
932 -                       if (mpd->next_page != page->index)
933 +                       if (mpd->map.m_len == 0)
934                                 mpd->first_page = page->index;
935                         mpd->next_page = page->index + 1;
936 -                       logical = (sector_t) page->index <<
937 -                               (PAGE_CACHE_SHIFT - inode->i_blkbits);
939                         /* Add all dirty buffers to mpd */
940 +                       lblk = ((ext4_lblk_t)page->index) <<
941 +                               (PAGE_CACHE_SHIFT - blkbits);
942                         head = page_buffers(page);
943 -                       bh = head;
944 -                       do {
945 -                               BUG_ON(buffer_locked(bh));
946 -                               /*
947 -                                * We need to try to allocate unmapped blocks
948 -                                * in the same page.  Otherwise we won't make
949 -                                * progress with the page in ext4_writepage
950 -                                */
951 -                               if (ext4_bh_delay_or_unwritten(NULL, bh)) {
952 -                                       mpage_add_bh_to_extent(mpd, logical,
953 -                                                              bh->b_state);
954 -                                       if (mpd->io_done)
955 -                                               goto ret_extent_tail;
956 -                               } else if (buffer_dirty(bh) &&
957 -                                          buffer_mapped(bh)) {
958 -                                       /*
959 -                                        * mapped dirty buffer. We need to
960 -                                        * update the b_state because we look
961 -                                        * at b_state in mpage_da_map_blocks.
962 -                                        * We don't update b_size because if we
963 -                                        * find an unmapped buffer_head later
964 -                                        * we need to use the b_state flag of
965 -                                        * that buffer_head.
966 -                                        */
967 -                                       if (mpd->b_size == 0)
968 -                                               mpd->b_state =
969 -                                                       bh->b_state & BH_FLAGS;
970 -                               }
971 -                               logical++;
972 -                       } while ((bh = bh->b_this_page) != head);
974 -                       if (nr_to_write > 0) {
975 -                               nr_to_write--;
976 -                               if (nr_to_write == 0 &&
977 -                                   wbc->sync_mode == WB_SYNC_NONE)
978 -                                       /*
979 -                                        * We stop writing back only if we are
980 -                                        * not doing integrity sync. In case of
981 -                                        * integrity sync we have to keep going
982 -                                        * because someone may be concurrently
983 -                                        * dirtying pages, and we might have
984 -                                        * synced a lot of newly appeared dirty
985 -                                        * pages, but have not synced all of the
986 -                                        * old dirty pages.
987 -                                        */
988 +                       if (!add_page_bufs_to_extent(mpd, head, head, lblk))
989 +                               goto out;
990 +                       /* So far everything mapped? Submit the page for IO. */
991 +                       if (mpd->map.m_len == 0) {
992 +                               err = mpage_submit_page(mpd, page);
993 +                               if (err < 0)
994                                         goto out;
995                         }
997 +                       /*
998 +                        * Accumulated enough dirty pages? This doesn't apply
999 +                        * to WB_SYNC_ALL mode. For integrity sync we have to
1000 +                        * keep going because someone may be concurrently
1001 +                        * dirtying pages, and we might have synced a lot of
1002 +                        * newly appeared dirty pages, but have not synced all
1003 +                        * of the old dirty pages.
1004 +                        */
1005 +                       if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
1006 +                           mpd->next_page - mpd->first_page >=
1007 +                                                       mpd->wbc->nr_to_write)
1008 +                               goto out;
1009                 }
1010                 pagevec_release(&pvec);
1011                 cond_resched();
1012         }
1013         return 0;
1014 -ret_extent_tail:
1015 -       ret = MPAGE_DA_EXTENT_TAIL;
1016  out:
1017         pagevec_release(&pvec);
1018 -       cond_resched();
1019 -       return ret;
1020 +       return err;
1024  static int ext4_da_writepages(struct address_space *mapping,
1025                               struct writeback_control *wbc)
1027 -       pgoff_t index;
1028 +       pgoff_t writeback_index = 0;
1029 +       long nr_to_write = wbc->nr_to_write;
1030         int range_whole = 0;
1031 +       int cycled = 1;
1032         handle_t *handle = NULL;
1033         struct mpage_da_data mpd;
1034         struct inode *inode = mapping->host;
1035 -       int pages_written = 0;
1036 -       int range_cyclic, cycled = 1, io_done = 0;
1037         int needed_blocks, ret = 0;
1038 -       loff_t range_start = wbc->range_start;
1039         struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
1040 -       pgoff_t done_index = 0;
1041 -       pgoff_t end;
1042 +       bool done;
1043         struct blk_plug plug;
1045         trace_ext4_da_writepages(inode, wbc);
1046 @@ -2432,40 +2379,65 @@ static int ext4_da_writepages(struct address_space *mapping,
1047         if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
1048                 return -EROFS;
1050 +       /*
1051 +        * If we have inline data and arrive here, it means that
1052 +        * we will soon create the block for the 1st page, so
1053 +        * we'd better clear the inline data here.
1054 +        */
1055 +       if (ext4_has_inline_data(inode)) {
1056 +               /* Just inode will be modified... */
1057 +               handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
1058 +               if (IS_ERR(handle)) {
1059 +                       ret = PTR_ERR(handle);
1060 +                       goto out_writepages;
1061 +               }
1062 +               BUG_ON(ext4_test_inode_state(inode,
1063 +                               EXT4_STATE_MAY_INLINE_DATA));
1064 +               ext4_destroy_inline_data(handle, inode);
1065 +               ext4_journal_stop(handle);
1066 +       }
1068         if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
1069                 range_whole = 1;
1071 -       range_cyclic = wbc->range_cyclic;
1072         if (wbc->range_cyclic) {
1073 -               index = mapping->writeback_index;
1074 -               if (index)
1075 +               writeback_index = mapping->writeback_index;
1076 +               if (writeback_index)
1077                         cycled = 0;
1078 -               wbc->range_start = index << PAGE_CACHE_SHIFT;
1079 -               wbc->range_end  = LLONG_MAX;
1080 -               wbc->range_cyclic = 0;
1081 -               end = -1;
1082 +               mpd.first_page = writeback_index;
1083 +               mpd.last_page = -1;
1084         } else {
1085 -               index = wbc->range_start >> PAGE_CACHE_SHIFT;
1086 -               end = wbc->range_end >> PAGE_CACHE_SHIFT;
1087 +               mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
1088 +               mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
1089         }
1091 +       mpd.inode = inode;
1092 +       mpd.wbc = wbc;
1093 +       ext4_io_submit_init(&mpd.io_submit, wbc);
1094  retry:
1095         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
1096 -               tag_pages_for_writeback(mapping, index, end);
1098 +               tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
1099 +       done = false;
1100         blk_start_plug(&plug);
1101 -       while (!ret && wbc->nr_to_write > 0) {
1102 +       while (!done && mpd.first_page <= mpd.last_page) {
1103 +               /* For each extent of pages we use new io_end */
1104 +               mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
1105 +               if (!mpd.io_submit.io_end) {
1106 +                       ret = -ENOMEM;
1107 +                       break;
1108 +               }
1110                 /*
1111 -                * we  insert one extent at a time. So we need
1112 -                * credit needed for single extent allocation.
1113 -                * journalled mode is currently not supported
1114 -                * by delalloc
1115 +                * We have two constraints: We find one extent to map and we
1116 +                * must always write out whole page (makes a difference when
1117 +                * blocksize < pagesize) so that we don't block on IO when we
1118 +                * try to write out the rest of the page. Journalled mode is
1119 +                * not supported by delalloc.
1120                  */
1121                 BUG_ON(ext4_should_journal_data(inode));
1122                 needed_blocks = ext4_da_writepages_trans_blocks(inode);
1124 -               /* start a new transaction*/
1125 +               /* start a new transaction */
1126                 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
1127                                             needed_blocks);
1128                 if (IS_ERR(handle)) {
1129 @@ -2473,76 +2445,67 @@ retry:
1130                         ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
1131                                "%ld pages, ino %lu; err %d", __func__,
1132                                 wbc->nr_to_write, inode->i_ino, ret);
1133 -                       blk_finish_plug(&plug);
1134 -                       goto out_writepages;
1135 +                       /* Release allocated io_end */
1136 +                       ext4_put_io_end(mpd.io_submit.io_end);
1137 +                       break;
1138                 }
1140 -               /*
1141 -                * Now call write_cache_pages_da() to find the next
1142 -                * contiguous region of logical blocks that need
1143 -                * blocks to be allocated by ext4 and submit them.
1144 -                */
1145 -               ret = write_cache_pages_da(handle, mapping,
1146 -                                          wbc, &mpd, &done_index);
1147 -               /*
1148 -                * If we have a contiguous extent of pages and we
1149 -                * haven't done the I/O yet, map the blocks and submit
1150 -                * them for I/O.
1151 -                */
1152 -               if (!mpd.io_done && mpd.next_page != mpd.first_page) {
1153 -                       mpage_da_map_and_submit(&mpd);
1154 -                       ret = MPAGE_DA_EXTENT_TAIL;
1155 +               trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
1156 +               ret = mpage_prepare_extent_to_map(&mpd);
1157 +               if (!ret) {
1158 +                       if (mpd.map.m_len)
1159 +                               ret = mpage_map_and_submit_extent(handle, &mpd);
1160 +                       else {
1161 +                               /*
1162 +                                * We scanned the whole range (or exhausted
1163 +                                * nr_to_write), submitted what was mapped and
1164 +                                * didn't find anything needing mapping. We are
1165 +                                * done.
1166 +                                */
1167 +                               done = true;
1168 +                       }
1169                 }
1170 -               trace_ext4_da_write_pages(inode, &mpd);
1171 -               wbc->nr_to_write -= mpd.pages_written;
1173                 ext4_journal_stop(handle);
1175 -               if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
1176 -                       /* commit the transaction which would
1177 +               /* Submit prepared bio */
1178 +               ext4_io_submit(&mpd.io_submit);
1179 +               /* Unlock pages we didn't use */
1180 +               mpage_release_unused_pages(&mpd, false);
1181 +               /* Drop our io_end reference we got from init */
1182 +               ext4_put_io_end(mpd.io_submit.io_end);
1184 +               if (ret == -ENOSPC && sbi->s_journal) {
1185 +                       /*
1186 +                        * Commit the transaction which would
1187                          * free blocks released in the transaction
1188                          * and try again
1189                          */
1190                         jbd2_journal_force_commit_nested(sbi->s_journal);
1191                         ret = 0;
1192 -               } else if (ret == MPAGE_DA_EXTENT_TAIL) {
1193 -                       /*
1194 -                        * Got one extent now try with rest of the pages.
1195 -                        * If mpd.retval is set -EIO, journal is aborted.
1196 -                        * So we don't need to write any more.
1197 -                        */
1198 -                       pages_written += mpd.pages_written;
1199 -                       ret = mpd.retval;
1200 -                       io_done = 1;
1201 -               } else if (wbc->nr_to_write)
1202 -                       /*
1203 -                        * There is no more writeout needed
1204 -                        * or we requested for a noblocking writeout
1205 -                        * and we found the device congested
1206 -                        */
1207 +                       continue;
1208 +               }
1209 +               /* Fatal error - ENOMEM, EIO... */
1210 +               if (ret)
1211                         break;
1212         }
1213         blk_finish_plug(&plug);
1214 -       if (!io_done && !cycled) {
1215 +       if (!ret && !cycled) {
1216                 cycled = 1;
1217 -               index = 0;
1218 -               wbc->range_start = index << PAGE_CACHE_SHIFT;
1219 -               wbc->range_end  = mapping->writeback_index - 1;
1220 +               mpd.last_page = writeback_index - 1;
1221 +               mpd.first_page = 0;
1222                 goto retry;
1223         }
1225         /* Update index */
1226 -       wbc->range_cyclic = range_cyclic;
1227         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
1228                 /*
1229 -                * set the writeback_index so that range_cyclic
1230 +                * Set the writeback_index so that range_cyclic
1231                  * mode will write it back later
1232                  */
1233 -               mapping->writeback_index = done_index;
1234 +               mapping->writeback_index = mpd.first_page;
1236  out_writepages:
1237 -       wbc->range_start = range_start;
1238 -       trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
1239 +       trace_ext4_da_writepages_result(inode, wbc, ret,
1240 +                                       nr_to_write - wbc->nr_to_write);
1241         return ret;
1244 diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
1245 index 19599bd..3e58546 100644
1246 --- a/fs/ext4/page-io.c
1247 +++ b/fs/ext4/page-io.c
1248 @@ -360,9 +360,6 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
1249         bio->bi_bdev = bh->b_bdev;
1250         bio->bi_end_io = ext4_end_bio;
1251         bio->bi_private = ext4_get_io_end(io->io_end);
1252 -       if (!io->io_end->size)
1253 -               io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
1254 -                                    + bh_offset(bh);
1255         io->io_bio = bio;
1256         io->io_next_block = bh->b_blocknr;
1257         return 0;
1258 @@ -390,7 +387,6 @@ submit_and_retry:
1259         io_end = io->io_end;
1260         if (test_clear_buffer_uninit(bh))
1261                 ext4_set_io_unwritten_flag(inode, io_end);
1262 -       io_end->size += bh->b_size;
1263         io->io_next_block++;
1264         return 0;
1266 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
1267 index e23b218..08a37a4 100644
1268 --- a/include/trace/events/ext4.h
1269 +++ b/include/trace/events/ext4.h
1270 @@ -324,43 +324,59 @@ TRACE_EVENT(ext4_da_writepages,
1271  );
1273  TRACE_EVENT(ext4_da_write_pages,
1274 -       TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
1275 +       TP_PROTO(struct inode *inode, pgoff_t first_page,
1276 +                struct writeback_control *wbc),
1278 -       TP_ARGS(inode, mpd),
1279 +       TP_ARGS(inode, first_page, wbc),
1281         TP_STRUCT__entry(
1282                 __field(        dev_t,  dev                     )
1283                 __field(        ino_t,  ino                     )
1284 -               __field(        __u64,  b_blocknr               )
1285 -               __field(        __u32,  b_size                  )
1286 -               __field(        __u32,  b_state                 )
1287 -               __field(        unsigned long,  first_page      )
1288 -               __field(        int,    io_done                 )
1289 -               __field(        int,    pages_written           )
1290 -               __field(        int,    sync_mode               )
1291 +               __field(      pgoff_t,  first_page              )
1292 +               __field(         long,  nr_to_write             )
1293 +               __field(          int,  sync_mode               )
1294         ),
1296         TP_fast_assign(
1297                 __entry->dev            = inode->i_sb->s_dev;
1298                 __entry->ino            = inode->i_ino;
1299 -               __entry->b_blocknr      = mpd->b_blocknr;
1300 -               __entry->b_size         = mpd->b_size;
1301 -               __entry->b_state        = mpd->b_state;
1302 -               __entry->first_page     = mpd->first_page;
1303 -               __entry->io_done        = mpd->io_done;
1304 -               __entry->pages_written  = mpd->pages_written;
1305 -               __entry->sync_mode      = mpd->wbc->sync_mode;
1306 +               __entry->first_page     = first_page;
1307 +               __entry->nr_to_write    = wbc->nr_to_write;
1308 +               __entry->sync_mode      = wbc->sync_mode;
1309         ),
1311 -       TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
1312 -                 "first_page %lu io_done %d pages_written %d sync_mode %d",
1313 +       TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
1314 +                 "sync_mode %d",
1315                   MAJOR(__entry->dev), MINOR(__entry->dev),
1316 -                 (unsigned long) __entry->ino,
1317 -                 __entry->b_blocknr, __entry->b_size,
1318 -                 __entry->b_state, __entry->first_page,
1319 -                 __entry->io_done, __entry->pages_written,
1320 -                 __entry->sync_mode
1321 -                  )
1322 +                 (unsigned long) __entry->ino, __entry->first_page,
1323 +                 __entry->nr_to_write, __entry->sync_mode)
1326 +TRACE_EVENT(ext4_da_write_pages_extent,
1327 +       TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
1329 +       TP_ARGS(inode, map),
1331 +       TP_STRUCT__entry(
1332 +               __field(        dev_t,  dev                     )
1333 +               __field(        ino_t,  ino                     )
1334 +               __field(        __u64,  lblk                    )
1335 +               __field(        __u32,  len                     )
1336 +               __field(        __u32,  flags                   )
1337 +       ),
1339 +       TP_fast_assign(
1340 +               __entry->dev            = inode->i_sb->s_dev;
1341 +               __entry->ino            = inode->i_ino;
1342 +               __entry->lblk           = map->m_lblk;
1343 +               __entry->len            = map->m_len;
1344 +               __entry->flags          = map->m_flags;
1345 +       ),
1347 +       TP_printk("dev %d,%d ino %lu lblk %llu len %u flags 0x%04x",
1348 +                 MAJOR(__entry->dev), MINOR(__entry->dev),
1349 +                 (unsigned long) __entry->ino, __entry->lblk, __entry->len,
1350 +                 __entry->flags)
1351  );
1353  TRACE_EVENT(ext4_da_writepages_result,