ext4: restructure writeback path

From: Jan Kara <jack@suse.cz>
There are two issues with the current writeback path in ext4.  First,
when blocksize < pagesize we don't necessarily map complete pages, so a
single iteration may end up doing no writeback at all.  We always map
some blocks, so we will eventually finish mapping the page, but if
writeback races with other operations on the file, forward progress is
not really guaranteed.  Second, the current code structure makes it
hard to associate all the bios for some range of pages with one io_end
structure, so that unwritten extents can be converted after all the
bios have finished.  This will be especially difficult later, when the
io_end will be associated with a reserved transaction handle.
We restructure the writeback path into a relatively simple loop which
first prepares an extent of pages, then maps one or more extents so
that no page is left partially mapped, and once a page is fully mapped
it is submitted for IO.  We keep all the mapping and IO submission
information in the mpage_da_data structure to somewhat reduce stack
usage.  The resulting code is somewhat shorter than the old one and
hopefully also easier to read.
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h              |   15 -
 fs/ext4/inode.c             | 1011 ++++++++++++++++++++++++++++++++----------------------------------
 fs/ext4/page-io.c           |    4 -
 include/trace/events/ext4.h |   64 +++--
 4 files changed, 527 insertions(+), 567 deletions(-)
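
Note: the heart of the restructured path is the new writeback loop in
ext4_da_writepages() in the diff below.  Stripped of error handling,
the ENOSPC retry, and the range_cyclic handling, one iteration of that
loop looks roughly like this (a simplified, illustrative sketch only;
see the full function in the diff for the exact code):

	while (!done && mpd.first_page <= mpd.last_page) {
		/* Each extent of pages gets its own io_end */
		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
		/* Start a transaction for one extent worth of allocation */
		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
					    needed_blocks);
		/* Find & lock dirty pages, accumulate extent needing mapping */
		ret = mpage_prepare_extent_to_map(&mpd);
		if (!ret) {
			if (mpd.map.m_len)
				/* Allocate blocks, update buffers, submit
				 * fully mapped pages for IO */
				ret = mpage_map_and_submit_extent(handle, &mpd);
			else
				/* Nothing left that needs mapping */
				done = true;
		}
		ext4_journal_stop(handle);
		/* Submit prepared bio */
		ext4_io_submit(&mpd.io_submit);
		/* Unlock pages we didn't use */
		mpage_release_unused_pages(&mpd, false);
		/* Drop io_end reference we got from init */
		ext4_put_io_end(mpd.io_submit.io_end);
	}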
35 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
36 index 2ebfcde..90a164f 100644
37 --- a/fs/ext4/ext4.h
38 +++ b/fs/ext4/ext4.h
39 @@ -177,21 +177,6 @@ struct ext4_map_blocks {
40  };
42  /*
43 - * For delayed allocation tracking
44 - */
45 -struct mpage_da_data {
46 -       struct inode *inode;
47 -       sector_t b_blocknr;             /* start block number of extent */
48 -       size_t b_size;                  /* size of extent */
49 -       unsigned long b_state;          /* state of the extent */
50 -       unsigned long first_page, next_page;    /* extent of pages */
51 -       struct writeback_control *wbc;
52 -       int io_done;
53 -       int pages_written;
54 -       int retval;
55 -};
57 -/*
58   * Flags for ext4_io_end->flags
59   */
60  #define        EXT4_IO_END_UNWRITTEN   0x0001
61 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
62 index 2b777e5..5939a47 100644
63 --- a/fs/ext4/inode.c
64 +++ b/fs/ext4/inode.c
65 @@ -1407,149 +1407,42 @@ static void ext4_da_page_release_reservation(struct page *page,
66   * Delayed allocation stuff
67   */
69 -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd);
71 -/*
72 - * mpage_da_submit_io - walks through extent of pages and try to write
73 - * them with writepage() call back
74 - *
75 - * @mpd->inode: inode
76 - * @mpd->first_page: first page of the extent
77 - * @mpd->next_page: page after the last page of the extent
78 - *
79 - * By the time mpage_da_submit_io() is called we expect all blocks
80 - * to be allocated. this may be wrong if allocation failed.
81 - *
82 - * As pages are already locked by write_cache_pages(), we can't use it
83 - */
84 -static int mpage_da_submit_io(struct mpage_da_data *mpd,
85 -                             struct ext4_map_blocks *map)
87 -       struct pagevec pvec;
88 -       unsigned long index, end;
89 -       int ret = 0, err, nr_pages, i;
90 -       struct inode *inode = mpd->inode;
91 -       struct address_space *mapping = inode->i_mapping;
92 -       loff_t size = i_size_read(inode);
93 -       unsigned int len, block_start;
94 -       struct buffer_head *bh, *page_bufs = NULL;
95 -       sector_t pblock = 0, cur_logical = 0;
96 -       struct ext4_io_submit io_submit;
98 -       BUG_ON(mpd->next_page <= mpd->first_page);
99 -       ext4_io_submit_init(&io_submit, mpd->wbc);
100 -       io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
101 -       if (!io_submit.io_end) {
102 -               ext4_da_block_invalidatepages(mpd);
103 -               return -ENOMEM;
104 -       }
105 +struct mpage_da_data {
106 +       struct inode *inode;
107 +       struct writeback_control *wbc;
108 +       pgoff_t first_page;     /* The first page to write */
109 +       pgoff_t next_page;      /* Current page to examine */
110 +       pgoff_t last_page;      /* Last page to examine */
111         /*
112 -        * We need to start from the first_page to the next_page - 1
113 -        * to make sure we also write the mapped dirty buffer_heads.
114 -        * If we look at mpd->b_blocknr we would only be looking
115 -        * at the currently mapped buffer_heads.
116 +        * Extent to map - this can be after first_page because that can be
117 +        * fully mapped. We somewhat abuse m_flags to store whether the extent
118 +        * is delalloc or unwritten.
119          */
120 -       index = mpd->first_page;
121 -       end = mpd->next_page - 1;
123 -       pagevec_init(&pvec, 0);
124 -       while (index <= end) {
125 -               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
126 -               if (nr_pages == 0)
127 -                       break;
128 -               for (i = 0; i < nr_pages; i++) {
129 -                       int skip_page = 0;
130 -                       struct page *page = pvec.pages[i];
132 -                       index = page->index;
133 -                       if (index > end)
134 -                               break;
136 -                       if (index == size >> PAGE_CACHE_SHIFT)
137 -                               len = size & ~PAGE_CACHE_MASK;
138 -                       else
139 -                               len = PAGE_CACHE_SIZE;
140 -                       if (map) {
141 -                               cur_logical = index << (PAGE_CACHE_SHIFT -
142 -                                                       inode->i_blkbits);
143 -                               pblock = map->m_pblk + (cur_logical -
144 -                                                       map->m_lblk);
145 -                       }
146 -                       index++;
148 -                       BUG_ON(!PageLocked(page));
149 -                       BUG_ON(PageWriteback(page));
151 -                       bh = page_bufs = page_buffers(page);
152 -                       block_start = 0;
153 -                       do {
154 -                               if (map && (cur_logical >= map->m_lblk) &&
155 -                                   (cur_logical <= (map->m_lblk +
156 -                                                    (map->m_len - 1)))) {
157 -                                       if (buffer_delay(bh)) {
158 -                                               clear_buffer_delay(bh);
159 -                                               bh->b_blocknr = pblock;
160 -                                       }
161 -                                       if (buffer_unwritten(bh) ||
162 -                                           buffer_mapped(bh))
163 -                                               BUG_ON(bh->b_blocknr != pblock);
164 -                                       if (map->m_flags & EXT4_MAP_UNINIT)
165 -                                               set_buffer_uninit(bh);
166 -                                       clear_buffer_unwritten(bh);
167 -                               }
169 -                               /*
170 -                                * skip page if block allocation undone and
171 -                                * block is dirty
172 -                                */
173 -                               if (ext4_bh_delay_or_unwritten(NULL, bh))
174 -                                       skip_page = 1;
175 -                               bh = bh->b_this_page;
176 -                               block_start += bh->b_size;
177 -                               cur_logical++;
178 -                               pblock++;
179 -                       } while (bh != page_bufs);
181 -                       if (skip_page) {
182 -                               unlock_page(page);
183 -                               continue;
184 -                       }
186 -                       clear_page_dirty_for_io(page);
187 -                       err = ext4_bio_write_page(&io_submit, page, len,
188 -                                                 mpd->wbc);
189 -                       if (!err)
190 -                               mpd->pages_written++;
191 -                       /*
192 -                        * In error case, we have to continue because
193 -                        * remaining pages are still locked
194 -                        */
195 -                       if (ret == 0)
196 -                               ret = err;
197 -               }
198 -               pagevec_release(&pvec);
199 -       }
200 -       ext4_io_submit(&io_submit);
201 -       /* Drop io_end reference we got from init */
202 -       ext4_put_io_end_defer(io_submit.io_end);
203 -       return ret;
205 +       struct ext4_map_blocks map;
206 +       struct ext4_io_submit io_submit;        /* IO submission data */
209 -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
210 +static void mpage_release_unused_pages(struct mpage_da_data *mpd,
211 +                                      bool invalidate)
213         int nr_pages, i;
214         pgoff_t index, end;
215         struct pagevec pvec;
216         struct inode *inode = mpd->inode;
217         struct address_space *mapping = inode->i_mapping;
218 -       ext4_lblk_t start, last;
220 +       /* This is necessary when next_page == 0. */
221 +       if (mpd->first_page >= mpd->next_page)
222 +               return;
224         index = mpd->first_page;
225         end   = mpd->next_page - 1;
227 -       start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
228 -       last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
229 -       ext4_es_remove_extent(inode, start, last - start + 1);
230 +       if (invalidate) {
231 +               ext4_lblk_t start, last;
232 +               start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
233 +               last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
234 +               ext4_es_remove_extent(inode, start, last - start + 1);
235 +       }
237         pagevec_init(&pvec, 0);
238         while (index <= end) {
239 @@ -1562,14 +1455,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
240                                 break;
241                         BUG_ON(!PageLocked(page));
242                         BUG_ON(PageWriteback(page));
243 -                       block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
244 -                       ClearPageUptodate(page);
245 +                       if (invalidate) {
246 +                               block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
247 +                               ClearPageUptodate(page);
248 +                       }
249                         unlock_page(page);
250                 }
251                 index = pvec.pages[nr_pages - 1]->index + 1;
252                 pagevec_release(&pvec);
253         }
254 -       return;
257  static void ext4_print_free_blocks(struct inode *inode)
258 @@ -1598,215 +1492,6 @@ static void ext4_print_free_blocks(struct inode *inode)
259         return;
263 - * mpage_da_map_and_submit - go through given space, map them
264 - *       if necessary, and then submit them for I/O
265 - *
266 - * @mpd - bh describing space
267 - *
268 - * The function skips space we know is already mapped to disk blocks.
269 - *
270 - */
271 -static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
273 -       int err, blks, get_blocks_flags;
274 -       struct ext4_map_blocks map, *mapp = NULL;
275 -       sector_t next = mpd->b_blocknr;
276 -       unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
277 -       loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
278 -       handle_t *handle = NULL;
280 -       /*
281 -        * If the blocks are mapped already, or we couldn't accumulate
282 -        * any blocks, then proceed immediately to the submission stage.
283 -        */
284 -       if ((mpd->b_size == 0) ||
285 -           ((mpd->b_state  & (1 << BH_Mapped)) &&
286 -            !(mpd->b_state & (1 << BH_Delay)) &&
287 -            !(mpd->b_state & (1 << BH_Unwritten))))
288 -               goto submit_io;
290 -       handle = ext4_journal_current_handle();
291 -       BUG_ON(!handle);
293 -       /*
294 -        * Call ext4_map_blocks() to allocate any delayed allocation
295 -        * blocks, or to convert an uninitialized extent to be
296 -        * initialized (in the case where we have written into
297 -        * one or more preallocated blocks).
298 -        *
299 -        * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
300 -        * indicate that we are on the delayed allocation path.  This
301 -        * affects functions in many different parts of the allocation
302 -        * call path.  This flag exists primarily because we don't
303 -        * want to change *many* call functions, so ext4_map_blocks()
304 -        * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
305 -        * inode's allocation semaphore is taken.
306 -        *
307 -        * If the blocks in questions were delalloc blocks, set
308 -        * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
309 -        * variables are updated after the blocks have been allocated.
310 -        */
311 -       map.m_lblk = next;
312 -       map.m_len = max_blocks;
313 -       /*
314 -        * We're in delalloc path and it is possible that we're going to
315 -        * need more metadata blocks than previously reserved. However
316 -        * we must not fail because we're in writeback and there is
317 -        * nothing we can do about it so it might result in data loss.
318 -        * So use reserved blocks to allocate metadata if possible.
319 -        */
320 -       get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
321 -                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
322 -       if (ext4_should_dioread_nolock(mpd->inode))
323 -               get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
324 -       if (mpd->b_state & (1 << BH_Delay))
325 -               get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
328 -       blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
329 -       if (blks < 0) {
330 -               struct super_block *sb = mpd->inode->i_sb;
332 -               err = blks;
333 -               /*
334 -                * If get block returns EAGAIN or ENOSPC and there
335 -                * appears to be free blocks we will just let
336 -                * mpage_da_submit_io() unlock all of the pages.
337 -                */
338 -               if (err == -EAGAIN)
339 -                       goto submit_io;
341 -               if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
342 -                       mpd->retval = err;
343 -                       goto submit_io;
344 -               }
346 -               /*
347 -                * get block failure will cause us to loop in
348 -                * writepages, because a_ops->writepage won't be able
349 -                * to make progress. The page will be redirtied by
350 -                * writepage and writepages will again try to write
351 -                * the same.
352 -                */
353 -               if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
354 -                       ext4_msg(sb, KERN_CRIT,
355 -                                "delayed block allocation failed for inode %lu "
356 -                                "at logical offset %llu with max blocks %zd "
357 -                                "with error %d", mpd->inode->i_ino,
358 -                                (unsigned long long) next,
359 -                                mpd->b_size >> mpd->inode->i_blkbits, err);
360 -                       ext4_msg(sb, KERN_CRIT,
361 -                               "This should not happen!! Data will be lost");
362 -                       if (err == -ENOSPC)
363 -                               ext4_print_free_blocks(mpd->inode);
364 -               }
365 -               /* invalidate all the pages */
366 -               ext4_da_block_invalidatepages(mpd);
368 -               /* Mark this page range as having been completed */
369 -               mpd->io_done = 1;
370 -               return;
371 -       }
372 -       BUG_ON(blks == 0);
374 -       mapp = &map;
375 -       if (map.m_flags & EXT4_MAP_NEW) {
376 -               struct block_device *bdev = mpd->inode->i_sb->s_bdev;
377 -               int i;
379 -               for (i = 0; i < map.m_len; i++)
380 -                       unmap_underlying_metadata(bdev, map.m_pblk + i);
381 -       }
383 -       /*
384 -        * Update on-disk size along with block allocation.
385 -        */
386 -       disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
387 -       if (disksize > i_size_read(mpd->inode))
388 -               disksize = i_size_read(mpd->inode);
389 -       if (disksize > EXT4_I(mpd->inode)->i_disksize) {
390 -               ext4_update_i_disksize(mpd->inode, disksize);
391 -               err = ext4_mark_inode_dirty(handle, mpd->inode);
392 -               if (err)
393 -                       ext4_error(mpd->inode->i_sb,
394 -                                  "Failed to mark inode %lu dirty",
395 -                                  mpd->inode->i_ino);
396 -       }
398 -submit_io:
399 -       mpage_da_submit_io(mpd, mapp);
400 -       mpd->io_done = 1;
403 -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
404 -               (1 << BH_Delay) | (1 << BH_Unwritten))
407 - * mpage_add_bh_to_extent - try to add one more block to extent of blocks
408 - *
409 - * @mpd->lbh - extent of blocks
410 - * @logical - logical number of the block in the file
411 - * @b_state - b_state of the buffer head added
412 - *
413 - * the function is used to collect contig. blocks in same state
414 - */
415 -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
416 -                                  unsigned long b_state)
418 -       sector_t next;
419 -       int blkbits = mpd->inode->i_blkbits;
420 -       int nrblocks = mpd->b_size >> blkbits;
422 -       /*
423 -        * XXX Don't go larger than mballoc is willing to allocate
424 -        * This is a stopgap solution.  We eventually need to fold
425 -        * mpage_da_submit_io() into this function and then call
426 -        * ext4_map_blocks() multiple times in a loop
427 -        */
428 -       if (nrblocks >= (8*1024*1024 >> blkbits))
429 -               goto flush_it;
431 -       /* check if the reserved journal credits might overflow */
432 -       if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
433 -               if (nrblocks >= EXT4_MAX_TRANS_DATA) {
434 -                       /*
435 -                        * With non-extent format we are limited by the journal
436 -                        * credit available.  Total credit needed to insert
437 -                        * nrblocks contiguous blocks is dependent on the
438 -                        * nrblocks.  So limit nrblocks.
439 -                        */
440 -                       goto flush_it;
441 -               }
442 -       }
443 -       /*
444 -        * First block in the extent
445 -        */
446 -       if (mpd->b_size == 0) {
447 -               mpd->b_blocknr = logical;
448 -               mpd->b_size = 1 << blkbits;
449 -               mpd->b_state = b_state & BH_FLAGS;
450 -               return;
451 -       }
453 -       next = mpd->b_blocknr + nrblocks;
454 -       /*
455 -        * Can we merge the block to our big extent?
456 -        */
457 -       if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
458 -               mpd->b_size += 1 << blkbits;
459 -               return;
460 -       }
462 -flush_it:
463 -       /*
464 -        * We couldn't merge the block to our extent, so we
465 -        * need to flush current  extent and start new one
466 -        */
467 -       mpage_da_map_and_submit(mpd);
468 -       return;
471  static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
473         return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
474 @@ -2204,6 +1889,8 @@ static int ext4_writepage(struct page *page,
475         return ret;
478 +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
480  /*
481   * mballoc gives us at most this number of blocks...
482   * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
483 @@ -2212,6 +1899,315 @@ static int ext4_writepage(struct page *page,
484  #define MAX_WRITEPAGES_EXTENT_LEN 2048
486  /*
487 + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
488 + *
489 + * @mpd - extent of blocks
490 + * @lblk - logical number of the block in the file
491 + * @b_state - b_state of the buffer head added
492 + *
493 + * the function is used to collect contig. blocks in same state
494 + */
495 +static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
496 +                                 unsigned long b_state)
498 +       struct ext4_map_blocks *map = &mpd->map;
500 +       /* Don't go larger than mballoc is willing to allocate */
501 +       if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
502 +               return 0;
504 +       /* First block in the extent? */
505 +       if (map->m_len == 0) {
506 +               map->m_lblk = lblk;
507 +               map->m_len = 1;
508 +               map->m_flags = b_state & BH_FLAGS;
509 +               return 1;
510 +       }
512 +       /* Can we merge the block to our big extent? */
513 +       if (lblk == map->m_lblk + map->m_len &&
514 +           (b_state & BH_FLAGS) == map->m_flags) {
515 +               map->m_len++;
516 +               return 1;
517 +       }
518 +       return 0;
521 +static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
522 +                                   struct buffer_head *head,
523 +                                   struct buffer_head *bh,
524 +                                   ext4_lblk_t lblk)
526 +       struct inode *inode = mpd->inode;
527 +       ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
528 +                                                       >> inode->i_blkbits;
530 +       do {
531 +               BUG_ON(buffer_locked(bh));
533 +               if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
534 +                   (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
535 +                   lblk >= blocks) {
536 +                       /* Found extent to map? */
537 +                       if (mpd->map.m_len)
538 +                               return false;
539 +                       if (lblk >= blocks)
540 +                               return true;
541 +                       continue;
542 +               }
543 +               if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
544 +                       return false;
545 +       } while (lblk++, (bh = bh->b_this_page) != head);
546 +       return true;
549 +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
551 +       int len;
552 +       loff_t size = i_size_read(mpd->inode);
553 +       int err;
555 +       BUG_ON(page->index != mpd->first_page);
556 +       if (page->index == size >> PAGE_CACHE_SHIFT)
557 +               len = size & ~PAGE_CACHE_MASK;
558 +       else
559 +               len = PAGE_CACHE_SIZE;
560 +       clear_page_dirty_for_io(page);
561 +       err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
562 +       if (!err)
563 +               mpd->wbc->nr_to_write--;
564 +       mpd->first_page++;
566 +       return err;
570 + * mpage_map_buffers - update buffers corresponding to changed extent and
571 + *                    submit fully mapped pages for IO
572 + *
573 + * @mpd - description of extent to map, on return next extent to map
574 + *
575 + * Scan buffers corresponding to changed extent (we expect corresponding pages
576 + * to be already locked) and update buffer state according to new extent state.
577 + * We map delalloc buffers to their physical location, clear unwritten bits,
578 + * and mark buffers as uninit when we perform writes to uninitialized extents
579 + * and do extent conversion after IO is finished. If the last page is not fully
580 + * mapped, we update @map to the next extent in the last page that needs
581 + * mapping. Otherwise we submit the page for IO.
582 + */
583 +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
585 +       struct pagevec pvec;
586 +       int nr_pages, i;
587 +       struct inode *inode = mpd->inode;
588 +       struct buffer_head *head, *bh;
589 +       int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
590 +       ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
591 +                                                       >> inode->i_blkbits;
592 +       pgoff_t start, end;
593 +       ext4_lblk_t lblk;
594 +       sector_t pblock;
595 +       int err;
597 +       start = mpd->map.m_lblk >> bpp_bits;
598 +       end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
599 +       lblk = start << bpp_bits;
600 +       pblock = mpd->map.m_pblk;
602 +       pagevec_init(&pvec, 0);
603 +       while (start <= end) {
604 +               nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
605 +                                         PAGEVEC_SIZE);
606 +               if (nr_pages == 0)
607 +                       break;
608 +               for (i = 0; i < nr_pages; i++) {
609 +                       struct page *page = pvec.pages[i];
611 +                       if (page->index > end)
612 +                               break;
613 +                       /* Upto 'end' pages must be contiguous */
614 +                       BUG_ON(page->index != start);
615 +                       bh = head = page_buffers(page);
616 +                       do {
617 +                               if (lblk < mpd->map.m_lblk)
618 +                                       continue;
619 +                               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
620 +                                       /*
621 +                                        * Buffer after end of mapped extent.
622 +                                        * Find next buffer in the page to map.
623 +                                        */
624 +                                       mpd->map.m_len = 0;
625 +                                       mpd->map.m_flags = 0;
626 +                                       add_page_bufs_to_extent(mpd, head, bh,
627 +                                                               lblk);
628 +                                       pagevec_release(&pvec);
629 +                                       return 0;
630 +                               }
631 +                               if (buffer_delay(bh)) {
632 +                                       clear_buffer_delay(bh);
633 +                                       bh->b_blocknr = pblock++;
634 +                               }
635 +                               if (mpd->map.m_flags & EXT4_MAP_UNINIT)
636 +                                       set_buffer_uninit(bh);
637 +                               clear_buffer_unwritten(bh);
638 +                       } while (++lblk < blocks &&
639 +                                (bh = bh->b_this_page) != head);
641 +                       /*
642 +                        * FIXME: This is going to break if dioread_nolock
643 +                        * supports blocksize < pagesize as we will try to
644 +                        * convert potentially unmapped parts of inode.
645 +                        */
646 +                       mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
647 +                       /* Page fully mapped - let IO run! */
648 +                       err = mpage_submit_page(mpd, page);
649 +                       if (err < 0) {
650 +                               pagevec_release(&pvec);
651 +                               return err;
652 +                       }
653 +                       start++;
654 +               }
655 +               pagevec_release(&pvec);
656 +       }
657 +       /* Extent fully mapped and matches with page boundary. We are done. */
658 +       mpd->map.m_len = 0;
659 +       mpd->map.m_flags = 0;
660 +       return 0;
663 +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
665 +       struct inode *inode = mpd->inode;
666 +       struct ext4_map_blocks *map = &mpd->map;
667 +       int get_blocks_flags;
668 +       int err;
670 +       trace_ext4_da_write_pages_extent(inode, map);
671 +       /*
672 +        * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
673 +        * to convert an uninitialized extent to be initialized (in the case
674 +        * where we have written into one or more preallocated blocks).  It is
675 +        * possible that we're going to need more metadata blocks than
676 +        * previously reserved. However we must not fail because we're in
677 +        * writeback and there is nothing we can do about it so it might result
678 +        * in data loss.  So use reserved blocks to allocate metadata if
679 +        * possible.
680 +        *
681 +        * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
682 +        * in question are delalloc blocks.  This affects functions in many
683 +        * different parts of the allocation call path.  This flag exists
684 +        * primarily because we don't want to change *many* call functions, so
685 +        * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
686 +        * once the inode's allocation semaphore is taken.
687 +        */
688 +       get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
689 +                          EXT4_GET_BLOCKS_METADATA_NOFAIL;
690 +       if (ext4_should_dioread_nolock(inode))
691 +               get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
692 +       if (map->m_flags & (1 << BH_Delay))
693 +               get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
695 +       err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
696 +       if (err < 0)
697 +               return err;
699 +       BUG_ON(map->m_len == 0);
700 +       if (map->m_flags & EXT4_MAP_NEW) {
701 +               struct block_device *bdev = inode->i_sb->s_bdev;
702 +               int i;
704 +               for (i = 0; i < map->m_len; i++)
705 +                       unmap_underlying_metadata(bdev, map->m_pblk + i);
706 +       }
707 +       return 0;
711 + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
712 + *                              mpd->len and submit pages underlying it for IO
713 + *
714 + * @handle - handle for journal operations
715 + * @mpd - extent to map
716 + *
717 + * The function maps extent starting at mpd->lblk of length mpd->len. If it is
718 + * delayed, blocks are allocated, if it is unwritten, we may need to convert
719 + * them to initialized or split the described range from larger unwritten
720 + * extent. Note that we need not map all the described range since allocation
721 + * can return less blocks or the range is covered by more unwritten extents. We
722 + * cannot map more because we are limited by reserved transaction credits. On
723 + * the other hand we always make sure that the last touched page is fully
724 + * mapped so that it can be written out (and thus forward progress is
725 + * guaranteed). After mapping we submit all mapped pages for IO.
726 + */
727 +static int mpage_map_and_submit_extent(handle_t *handle,
728 +                                      struct mpage_da_data *mpd)
730 +       struct inode *inode = mpd->inode;
731 +       struct ext4_map_blocks *map = &mpd->map;
732 +       int err;
733 +       loff_t disksize;
735 +       mpd->io_submit.io_end->offset =
736 +                               ((loff_t)map->m_lblk) << inode->i_blkbits;
737 +       while (map->m_len) {
738 +               err = mpage_map_one_extent(handle, mpd);
739 +               if (err < 0) {
740 +                       struct super_block *sb = inode->i_sb;
742 +                       /*
743 +                        * Need to commit transaction to free blocks. Let upper
744 +                        * layers sort it out.
745 +                        */
746 +                       if (err == -ENOSPC && ext4_count_free_clusters(sb))
747 +                               return -ENOSPC;
749 +                       if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
750 +                               ext4_msg(sb, KERN_CRIT,
751 +                                        "Delayed block allocation failed for "
752 +                                        "inode %lu at logical offset %llu with"
753 +                                        " max blocks %u with error %d",
754 +                                        inode->i_ino,
755 +                                        (unsigned long long)map->m_lblk,
756 +                                        (unsigned)map->m_len, err);
757 +                               ext4_msg(sb, KERN_CRIT,
758 +                                        "This should not happen!! Data will "
759 +                                        "be lost\n");
760 +                               if (err == -ENOSPC)
761 +                                       ext4_print_free_blocks(inode);
762 +                       }
763 +                       /* invalidate all the pages */
764 +                       mpage_release_unused_pages(mpd, true);
765 +                       return err;
766 +               }
767 +               /*
768 +                * Update buffer state, submit mapped pages, and get us new
769 +                * extent to map
770 +                */
771 +               err = mpage_map_and_submit_buffers(mpd);
772 +               if (err < 0)
773 +                       return err;
774 +       }
776 +       /* Update on-disk size after IO is submitted */
777 +       disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
778 +       if (disksize > i_size_read(inode))
779 +               disksize = i_size_read(inode);
780 +       if (disksize > EXT4_I(inode)->i_disksize) {
781 +               int err2;
783 +               ext4_update_i_disksize(inode, disksize);
784 +               err2 = ext4_mark_inode_dirty(handle, inode);
785 +               if (err2)
786 +                       ext4_error(inode->i_sb,
787 +                                  "Failed to mark inode %lu dirty",
788 +                                  inode->i_ino);
789 +               if (!err)
790 +                       err = err2;
791 +       }
792 +       return err;
796   * Calculate the total number of credits to reserve for one writepages
797   * iteration. This is called from ext4_da_writepages(). We map an extent of
798   * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
799 @@ -2227,44 +2223,49 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
802  /*
803 - * write_cache_pages_da - walk the list of dirty pages of the given
804 - * address space and accumulate pages that need writing, and call
805 - * mpage_da_map_and_submit to map a single contiguous memory region
806 - * and then write them.
807 + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
808 + *                              and underlying extent to map
809 + *
810 + * @mpd - where to look for pages
811 + *
812 + * Walk dirty pages in the mapping. If they are fully mapped, submit them for
813 + * IO immediately. When we find a page which isn't mapped we start accumulating
814 + * extent of buffers underlying these pages that needs mapping (formed by
815 + * either delayed or unwritten buffers). We also lock the pages containing
816 + * these buffers. The extent found is returned in @mpd structure (starting at
817 + * mpd->lblk with length mpd->len blocks).
818 + *
819 + * Note that this function can attach bios to one io_end structure which are
820 + * neither logically nor physically contiguous. Although it may seem as an
821 + * unnecessary complication, it is actually inevitable in blocksize < pagesize
822 + * case as we need to track IO to all buffers underlying a page in one io_end.
823   */
824 -static int write_cache_pages_da(handle_t *handle,
825 -                               struct address_space *mapping,
826 -                               struct writeback_control *wbc,
827 -                               struct mpage_da_data *mpd,
828 -                               pgoff_t *done_index)
829 +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
831 -       struct buffer_head      *bh, *head;
832 -       struct inode            *inode = mapping->host;
833 -       struct pagevec          pvec;
834 -       unsigned int            nr_pages;
835 -       sector_t                logical;
836 -       pgoff_t                 index, end;
837 -       long                    nr_to_write = wbc->nr_to_write;
838 -       int                     i, tag, ret = 0;
840 -       memset(mpd, 0, sizeof(struct mpage_da_data));
841 -       mpd->wbc = wbc;
842 -       mpd->inode = inode;
843 -       pagevec_init(&pvec, 0);
844 -       index = wbc->range_start >> PAGE_CACHE_SHIFT;
845 -       end = wbc->range_end >> PAGE_CACHE_SHIFT;
846 +       struct address_space *mapping = mpd->inode->i_mapping;
847 +       struct pagevec pvec;
848 +       unsigned int nr_pages;
849 +       pgoff_t index = mpd->first_page;
850 +       pgoff_t end = mpd->last_page;
851 +       int tag;
852 +       int i, err = 0;
853 +       int blkbits = mpd->inode->i_blkbits;
854 +       ext4_lblk_t lblk;
855 +       struct buffer_head *head;
857 -       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
858 +       if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
859                 tag = PAGECACHE_TAG_TOWRITE;
860         else
861                 tag = PAGECACHE_TAG_DIRTY;
863 -       *done_index = index;
864 +       pagevec_init(&pvec, 0);
865 +       mpd->map.m_len = 0;
866 +       mpd->next_page = index;
867         while (index <= end) {
868                 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
869                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
870                 if (nr_pages == 0)
871 -                       return 0;
872 +                       goto out;
874                 for (i = 0; i < nr_pages; i++) {
875                         struct page *page = pvec.pages[i];
876 @@ -2279,31 +2280,21 @@ static int write_cache_pages_da(handle_t *handle,
877                         if (page->index > end)
878                                 goto out;
880 -                       *done_index = page->index + 1;
882 -                       /*
883 -                        * If we can't merge this page, and we have
884 -                        * accumulated an contiguous region, write it
885 -                        */
886 -                       if ((mpd->next_page != page->index) &&
887 -                           (mpd->next_page != mpd->first_page)) {
888 -                               mpage_da_map_and_submit(mpd);
889 -                               goto ret_extent_tail;
890 -                       }
891 +                       /* If we can't merge this page, we are done. */
892 +                       if (mpd->map.m_len > 0 && mpd->next_page != page->index)
893 +                               goto out;
895                         lock_page(page);
897                         /*
898 -                        * If the page is no longer dirty, or its
899 -                        * mapping no longer corresponds to inode we
900 -                        * are writing (which means it has been
901 -                        * truncated or invalidated), or the page is
902 -                        * already under writeback and we are not
903 -                        * doing a data integrity writeback, skip the page
904 +                        * If the page is no longer dirty, or its mapping no
905 +                        * longer corresponds to inode we are writing (which
906 +                        * means it has been truncated or invalidated), or the
907 +                        * page is already under writeback and we are not doing
908 +                        * a data integrity writeback, skip the page
909                          */
910                         if (!PageDirty(page) ||
911                             (PageWriteback(page) &&
912 -                            (wbc->sync_mode == WB_SYNC_NONE)) ||
913 +                            (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
914                             unlikely(page->mapping != mapping)) {
915                                 unlock_page(page);
916                                 continue;
917 @@ -2312,101 +2303,57 @@ static int write_cache_pages_da(handle_t *handle,
918                         wait_on_page_writeback(page);
919                         BUG_ON(PageWriteback(page));
921 -                       /*
922 -                        * If we have inline data and arrive here, it means that
923 -                        * we will soon create the block for the 1st page, so
924 -                        * we'd better clear the inline data here.
925 -                        */
926 -                       if (ext4_has_inline_data(inode)) {
927 -                               BUG_ON(ext4_test_inode_state(inode,
928 -                                               EXT4_STATE_MAY_INLINE_DATA));
929 -                               ext4_destroy_inline_data(handle, inode);
930 -                       }
932 -                       if (mpd->next_page != page->index)
933 +                       if (mpd->map.m_len == 0)
934                                 mpd->first_page = page->index;
935                         mpd->next_page = page->index + 1;
936 -                       logical = (sector_t) page->index <<
937 -                               (PAGE_CACHE_SHIFT - inode->i_blkbits);
939                         /* Add all dirty buffers to mpd */
940 +                       lblk = ((ext4_lblk_t)page->index) <<
941 +                               (PAGE_CACHE_SHIFT - blkbits);
942                         head = page_buffers(page);
943 -                       bh = head;
944 -                       do {
945 -                               BUG_ON(buffer_locked(bh));
946 -                               /*
947 -                                * We need to try to allocate unmapped blocks
948 -                                * in the same page.  Otherwise we won't make
949 -                                * progress with the page in ext4_writepage
950 -                                */
951 -                               if (ext4_bh_delay_or_unwritten(NULL, bh)) {
952 -                                       mpage_add_bh_to_extent(mpd, logical,
953 -                                                              bh->b_state);
954 -                                       if (mpd->io_done)
955 -                                               goto ret_extent_tail;
956 -                               } else if (buffer_dirty(bh) &&
957 -                                          buffer_mapped(bh)) {
958 -                                       /*
959 -                                        * mapped dirty buffer. We need to
960 -                                        * update the b_state because we look
961 -                                        * at b_state in mpage_da_map_blocks.
962 -                                        * We don't update b_size because if we
963 -                                        * find an unmapped buffer_head later
964 -                                        * we need to use the b_state flag of
965 -                                        * that buffer_head.
966 -                                        */
967 -                                       if (mpd->b_size == 0)
968 -                                               mpd->b_state =
969 -                                                       bh->b_state & BH_FLAGS;
970 -                               }
971 -                               logical++;
972 -                       } while ((bh = bh->b_this_page) != head);
974 -                       if (nr_to_write > 0) {
975 -                               nr_to_write--;
976 -                               if (nr_to_write == 0 &&
977 -                                   wbc->sync_mode == WB_SYNC_NONE)
978 -                                       /*
979 -                                        * We stop writing back only if we are
980 -                                        * not doing integrity sync. In case of
981 -                                        * integrity sync we have to keep going
982 -                                        * because someone may be concurrently
983 -                                        * dirtying pages, and we might have
984 -                                        * synced a lot of newly appeared dirty
985 -                                        * pages, but have not synced all of the
986 -                                        * old dirty pages.
987 -                                        */
988 +                       if (!add_page_bufs_to_extent(mpd, head, head, lblk))
989 +                               goto out;
990 +                       /* So far everything mapped? Submit the page for IO. */
991 +                       if (mpd->map.m_len == 0) {
992 +                               err = mpage_submit_page(mpd, page);
993 +                               if (err < 0)
994                                         goto out;
995                         }
997 +                       /*
998 +                        * Accumulated enough dirty pages? This doesn't apply
999 +                        * to WB_SYNC_ALL mode. For integrity sync we have to
1000 +                        * keep going because someone may be concurrently
1001 +                        * dirtying pages, and we might have synced a lot of
1002 +                        * newly appeared dirty pages, but have not synced all
1003 +                        * of the old dirty pages.
1004 +                        */
1005 +                       if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
1006 +                           mpd->next_page - mpd->first_page >=
1007 +                                                       mpd->wbc->nr_to_write)
1008 +                               goto out;
1009                 }
1010                 pagevec_release(&pvec);
1011                 cond_resched();
1012         }
1013         return 0;
1014 -ret_extent_tail:
1015 -       ret = MPAGE_DA_EXTENT_TAIL;
1016  out:
1017         pagevec_release(&pvec);
1018 -       cond_resched();
1019 -       return ret;
1020 +       return err;
1024  static int ext4_da_writepages(struct address_space *mapping,
1025                               struct writeback_control *wbc)
1027 -       pgoff_t index;
1028 +       pgoff_t writeback_index = 0;
1029 +       long nr_to_write = wbc->nr_to_write;
1030         int range_whole = 0;
1031 +       int cycled = 1;
1032         handle_t *handle = NULL;
1033         struct mpage_da_data mpd;
1034         struct inode *inode = mapping->host;
1035 -       int pages_written = 0;
1036 -       int range_cyclic, cycled = 1, io_done = 0;
1037         int needed_blocks, ret = 0;
1038 -       loff_t range_start = wbc->range_start;
1039         struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
1040 -       pgoff_t done_index = 0;
1041 -       pgoff_t end;
1042 +       bool done;
1043         struct blk_plug plug;
1045         trace_ext4_da_writepages(inode, wbc);
1046 @@ -2432,40 +2379,65 @@ static int ext4_da_writepages(struct address_space *mapping,
1047         if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
1048                 return -EROFS;
1050 +       /*
1051 +        * If we have inline data and arrive here, it means that
1052 +        * we will soon create the block for the 1st page, so
1053 +        * we'd better clear the inline data here.
1054 +        */
1055 +       if (ext4_has_inline_data(inode)) {
1056 +               /* Just inode will be modified... */
1057 +               handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
1058 +               if (IS_ERR(handle)) {
1059 +                       ret = PTR_ERR(handle);
1060 +                       goto out_writepages;
1061 +               }
1062 +               BUG_ON(ext4_test_inode_state(inode,
1063 +                               EXT4_STATE_MAY_INLINE_DATA));
1064 +               ext4_destroy_inline_data(handle, inode);
1065 +               ext4_journal_stop(handle);
1066 +       }
1068         if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
1069                 range_whole = 1;
1071 -       range_cyclic = wbc->range_cyclic;
1072         if (wbc->range_cyclic) {
1073 -               index = mapping->writeback_index;
1074 -               if (index)
1075 +               writeback_index = mapping->writeback_index;
1076 +               if (writeback_index)
1077                         cycled = 0;
1078 -               wbc->range_start = index << PAGE_CACHE_SHIFT;
1079 -               wbc->range_end  = LLONG_MAX;
1080 -               wbc->range_cyclic = 0;
1081 -               end = -1;
1082 +               mpd.first_page = writeback_index;
1083 +               mpd.last_page = -1;
1084         } else {
1085 -               index = wbc->range_start >> PAGE_CACHE_SHIFT;
1086 -               end = wbc->range_end >> PAGE_CACHE_SHIFT;
1087 +               mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
1088 +               mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
1089         }
1091 +       mpd.inode = inode;
1092 +       mpd.wbc = wbc;
1093 +       ext4_io_submit_init(&mpd.io_submit, wbc);
1094  retry:
1095         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
1096 -               tag_pages_for_writeback(mapping, index, end);
1098 +               tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
1099 +       done = false;
1100         blk_start_plug(&plug);
1101 -       while (!ret && wbc->nr_to_write > 0) {
1102 +       while (!done && mpd.first_page <= mpd.last_page) {
1103 +               /* For each extent of pages we use new io_end */
1104 +               mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
1105 +               if (!mpd.io_submit.io_end) {
1106 +                       ret = -ENOMEM;
1107 +                       break;
1108 +               }
1110                 /*
1111 -                * we  insert one extent at a time. So we need
1112 -                * credit needed for single extent allocation.
1113 -                * journalled mode is currently not supported
1114 -                * by delalloc
1115 +                * We have two constraints: We find one extent to map and we
1116 +                * must always write out whole page (makes a difference when
1117 +                * blocksize < pagesize) so that we don't block on IO when we
1118 +                * try to write out the rest of the page. Journalled mode is
1119 +                * not supported by delalloc.
1120                  */
1121                 BUG_ON(ext4_should_journal_data(inode));
1122                 needed_blocks = ext4_da_writepages_trans_blocks(inode);
1124 -               /* start a new transaction*/
1125 +               /* start a new transaction */
1126                 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
1127                                             needed_blocks);
1128                 if (IS_ERR(handle)) {
1129 @@ -2473,76 +2445,67 @@ retry:
1130                         ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
1131                                "%ld pages, ino %lu; err %d", __func__,
1132                                 wbc->nr_to_write, inode->i_ino, ret);
1133 -                       blk_finish_plug(&plug);
1134 -                       goto out_writepages;
1135 +                       /* Release allocated io_end */
1136 +                       ext4_put_io_end(mpd.io_submit.io_end);
1137 +                       break;
1138                 }
1140 -               /*
1141 -                * Now call write_cache_pages_da() to find the next
1142 -                * contiguous region of logical blocks that need
1143 -                * blocks to be allocated by ext4 and submit them.
1144 -                */
1145 -               ret = write_cache_pages_da(handle, mapping,
1146 -                                          wbc, &mpd, &done_index);
1147 -               /*
1148 -                * If we have a contiguous extent of pages and we
1149 -                * haven't done the I/O yet, map the blocks and submit
1150 -                * them for I/O.
1151 -                */
1152 -               if (!mpd.io_done && mpd.next_page != mpd.first_page) {
1153 -                       mpage_da_map_and_submit(&mpd);
1154 -                       ret = MPAGE_DA_EXTENT_TAIL;
1155 +               trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
1156 +               ret = mpage_prepare_extent_to_map(&mpd);
1157 +               if (!ret) {
1158 +                       if (mpd.map.m_len)
1159 +                               ret = mpage_map_and_submit_extent(handle, &mpd);
1160 +                       else {
1161 +                               /*
1162 +                                * We scanned the whole range (or exhausted
1163 +                                * nr_to_write), submitted what was mapped and
1164 +                                * didn't find anything needing mapping. We are
1165 +                                * done.
1166 +                                */
1167 +                               done = true;
1168 +                       }
1169                 }
1170 -               trace_ext4_da_write_pages(inode, &mpd);
1171 -               wbc->nr_to_write -= mpd.pages_written;
1173                 ext4_journal_stop(handle);
1175 -               if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
1176 -                       /* commit the transaction which would
1177 +               /* Submit prepared bio */
1178 +               ext4_io_submit(&mpd.io_submit);
1179 +               /* Unlock pages we didn't use */
1180 +               mpage_release_unused_pages(&mpd, false);
1181 +               /* Drop our io_end reference we got from init */
1182 +               ext4_put_io_end(mpd.io_submit.io_end);
1184 +               if (ret == -ENOSPC && sbi->s_journal) {
1185 +                       /*
1186 +                        * Commit the transaction which would
1187                          * free blocks released in the transaction
1188                          * and try again
1189                          */
1190                         jbd2_journal_force_commit_nested(sbi->s_journal);
1191                         ret = 0;
1192 -               } else if (ret == MPAGE_DA_EXTENT_TAIL) {
1193 -                       /*
1194 -                        * Got one extent now try with rest of the pages.
1195 -                        * If mpd.retval is set -EIO, journal is aborted.
1196 -                        * So we don't need to write any more.
1197 -                        */
1198 -                       pages_written += mpd.pages_written;
1199 -                       ret = mpd.retval;
1200 -                       io_done = 1;
1201 -               } else if (wbc->nr_to_write)
1202 -                       /*
1203 -                        * There is no more writeout needed
1204 -                        * or we requested for a noblocking writeout
1205 -                        * and we found the device congested
1206 -                        */
1207 +                       continue;
1208 +               }
1209 +               /* Fatal error - ENOMEM, EIO... */
1210 +               if (ret)
1211                         break;
1212         }
1213         blk_finish_plug(&plug);
1214 -       if (!io_done && !cycled) {
1215 +       if (!ret && !cycled) {
1216                 cycled = 1;
1217 -               index = 0;
1218 -               wbc->range_start = index << PAGE_CACHE_SHIFT;
1219 -               wbc->range_end  = mapping->writeback_index - 1;
1220 +               mpd.last_page = writeback_index - 1;
1221 +               mpd.first_page = 0;
1222                 goto retry;
1223         }
1225         /* Update index */
1226 -       wbc->range_cyclic = range_cyclic;
1227         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
1228                 /*
1229 -                * set the writeback_index so that range_cyclic
1230 +                * Set the writeback_index so that range_cyclic
1231                  * mode will write it back later
1232                  */
1233 -               mapping->writeback_index = done_index;
1234 +               mapping->writeback_index = mpd.first_page;
1236  out_writepages:
1237 -       wbc->range_start = range_start;
1238 -       trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
1239 +       trace_ext4_da_writepages_result(inode, wbc, ret,
1240 +                                       nr_to_write - wbc->nr_to_write);
1241         return ret;
1244 diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
1245 index 19599bd..3e58546 100644
1246 --- a/fs/ext4/page-io.c
1247 +++ b/fs/ext4/page-io.c
1248 @@ -360,9 +360,6 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
1249         bio->bi_bdev = bh->b_bdev;
1250         bio->bi_end_io = ext4_end_bio;
1251         bio->bi_private = ext4_get_io_end(io->io_end);
1252 -       if (!io->io_end->size)
1253 -               io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
1254 -                                    + bh_offset(bh);
1255         io->io_bio = bio;
1256         io->io_next_block = bh->b_blocknr;
1257         return 0;
1258 @@ -390,7 +387,6 @@ submit_and_retry:
1259         io_end = io->io_end;
1260         if (test_clear_buffer_uninit(bh))
1261                 ext4_set_io_unwritten_flag(inode, io_end);
1262 -       io_end->size += bh->b_size;
1263         io->io_next_block++;
1264         return 0;
1266 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
1267 index e23b218..08a37a4 100644
1268 --- a/include/trace/events/ext4.h
1269 +++ b/include/trace/events/ext4.h
1270 @@ -324,43 +324,59 @@ TRACE_EVENT(ext4_da_writepages,
1271  );
1273  TRACE_EVENT(ext4_da_write_pages,
1274 -       TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
1275 +       TP_PROTO(struct inode *inode, pgoff_t first_page,
1276 +                struct writeback_control *wbc),
1278 -       TP_ARGS(inode, mpd),
1279 +       TP_ARGS(inode, first_page, wbc),
1281         TP_STRUCT__entry(
1282                 __field(        dev_t,  dev                     )
1283                 __field(        ino_t,  ino                     )
1284 -               __field(        __u64,  b_blocknr               )
1285 -               __field(        __u32,  b_size                  )
1286 -               __field(        __u32,  b_state                 )
1287 -               __field(        unsigned long,  first_page      )
1288 -               __field(        int,    io_done                 )
1289 -               __field(        int,    pages_written           )
1290 -               __field(        int,    sync_mode               )
1291 +               __field(      pgoff_t,  first_page              )
1292 +               __field(         long,  nr_to_write             )
1293 +               __field(          int,  sync_mode               )
1294         ),
1296         TP_fast_assign(
1297                 __entry->dev            = inode->i_sb->s_dev;
1298                 __entry->ino            = inode->i_ino;
1299 -               __entry->b_blocknr      = mpd->b_blocknr;
1300 -               __entry->b_size         = mpd->b_size;
1301 -               __entry->b_state        = mpd->b_state;
1302 -               __entry->first_page     = mpd->first_page;
1303 -               __entry->io_done        = mpd->io_done;
1304 -               __entry->pages_written  = mpd->pages_written;
1305 -               __entry->sync_mode      = mpd->wbc->sync_mode;
1306 +               __entry->first_page     = first_page;
1307 +               __entry->nr_to_write    = wbc->nr_to_write;
1308 +               __entry->sync_mode      = wbc->sync_mode;
1309         ),
1311 -       TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x "
1312 -                 "first_page %lu io_done %d pages_written %d sync_mode %d",
1313 +       TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
1314 +                 "sync_mode %d",
1315                   MAJOR(__entry->dev), MINOR(__entry->dev),
1316 -                 (unsigned long) __entry->ino,
1317 -                 __entry->b_blocknr, __entry->b_size,
1318 -                 __entry->b_state, __entry->first_page,
1319 -                 __entry->io_done, __entry->pages_written,
1320 -                 __entry->sync_mode
1321 -                  )
1322 +                 (unsigned long) __entry->ino, __entry->first_page,
1323 +                 __entry->nr_to_write, __entry->sync_mode)
1326 +TRACE_EVENT(ext4_da_write_pages_extent,
1327 +       TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
1329 +       TP_ARGS(inode, map),
1331 +       TP_STRUCT__entry(
1332 +               __field(        dev_t,  dev                     )
1333 +               __field(        ino_t,  ino                     )
1334 +               __field(        __u64,  lblk                    )
1335 +               __field(        __u32,  len                     )
1336 +               __field(        __u32,  flags                   )
1337 +       ),
1339 +       TP_fast_assign(
1340 +               __entry->dev            = inode->i_sb->s_dev;
1341 +               __entry->ino            = inode->i_ino;
1342 +               __entry->lblk           = map->m_lblk;
1343 +               __entry->len            = map->m_len;
1344 +               __entry->flags          = map->m_flags;
1345 +       ),
1347 +       TP_printk("dev %d,%d ino %lu lblk %llu len %u flags 0x%04x",
1348 +                 MAJOR(__entry->dev), MINOR(__entry->dev),
1349 +                 (unsigned long) __entry->ino, __entry->lblk, __entry->len,
1350 +                 __entry->flags)
1351  );
1353  TRACE_EVENT(ext4_da_writepages_result,