1 ext4: use io_end for multiple bios
3 From: Jan Kara <jack@suse.cz>
5 Change the writeback path to create just one io_end structure for the
6 extent to which we submit IO, and share it among the bios writing that
7 extent. This prevents needless splitting and joining of unwritten
8 extents when they cannot be submitted as a single bio.
10 Bugs in ENOMEM handling were found by the Linux File System Verification
11 project (linuxtesting.org) and fixed by Alexey Khoroshilov
12 <khoroshilov@ispras.ru>.
14 CC: Alexey Khoroshilov <khoroshilov@ispras.ru>
15 Signed-off-by: Jan Kara <jack@suse.cz>
16 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
18 fs/ext4/ext4.h | 8 +++-
19 fs/ext4/inode.c | 98 ++++++++++++++++++++++++++-----------------
20 fs/ext4/page-io.c | 121 ++++++++++++++++++++++++++++++++++--------------------
21 3 files changed, 142 insertions(+), 85 deletions(-)
23 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
24 index 5aae3d1..0aabb34 100644
27 @@ -209,6 +209,7 @@ typedef struct ext4_io_end {
28 ssize_t size; /* size of the extent */
29 struct kiocb *iocb; /* iocb struct for AIO */
30 int result; /* error value for AIO */
31 + atomic_t count; /* reference counter */
34 struct ext4_io_submit {
35 @@ -2650,11 +2651,14 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
38 extern int __init ext4_init_pageio(void);
39 -extern void ext4_add_complete_io(ext4_io_end_t *io_end);
40 extern void ext4_exit_pageio(void);
41 extern void ext4_ioend_shutdown(struct inode *);
42 -extern void ext4_free_io_end(ext4_io_end_t *io);
43 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
44 +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
45 +extern int ext4_put_io_end(ext4_io_end_t *io_end);
46 +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
47 +extern void ext4_io_submit_init(struct ext4_io_submit *io,
48 + struct writeback_control *wbc);
49 extern void ext4_end_io_work(struct work_struct *work);
50 extern void ext4_io_submit(struct ext4_io_submit *io);
51 extern int ext4_bio_write_page(struct ext4_io_submit *io,
52 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
53 index d6382b8..7e60724 100644
56 @@ -1460,6 +1460,8 @@ static void ext4_da_page_release_reservation(struct page *page,
57 * Delayed allocation stuff
60 +static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd);
63 * mpage_da_submit_io - walks through extent of pages and try to write
64 * them with writepage() call back
65 @@ -1488,7 +1490,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
66 struct ext4_io_submit io_submit;
68 BUG_ON(mpd->next_page <= mpd->first_page);
69 - memset(&io_submit, 0, sizeof(io_submit));
70 + ext4_io_submit_init(&io_submit, mpd->wbc);
71 + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
72 + if (!io_submit.io_end) {
73 + ext4_da_block_invalidatepages(mpd);
77 * We need to start from the first_page to the next_page - 1
78 * to make sure we also write the mapped dirty buffer_heads.
79 @@ -1576,6 +1583,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
80 pagevec_release(&pvec);
82 ext4_io_submit(&io_submit);
83 + /* Drop io_end reference we got from init */
84 + ext4_put_io_end_defer(io_submit.io_end);
88 @@ -2234,9 +2243,17 @@ static int ext4_writepage(struct page *page,
90 return __ext4_journalled_writepage(page, len);
92 - memset(&io_submit, 0, sizeof(io_submit));
93 + ext4_io_submit_init(&io_submit, wbc);
94 + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
95 + if (!io_submit.io_end) {
96 + redirty_page_for_writepage(wbc, page);
100 ret = ext4_bio_write_page(&io_submit, page, len, wbc);
101 ext4_io_submit(&io_submit);
102 + /* Drop io_end reference we got from init */
103 + ext4_put_io_end_defer(io_submit.io_end);
107 @@ -3067,9 +3084,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
108 struct inode *inode = file_inode(iocb->ki_filp);
109 ext4_io_end_t *io_end = iocb->private;
111 - /* if not async direct IO or dio with 0 bytes write, just return */
112 - if (!io_end || !size)
114 + /* if not async direct IO just return */
116 + inode_dio_done(inode);
118 + aio_complete(iocb, ret, 0);
122 ext_debug("ext4_end_io_dio(): io_end 0x%p "
123 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
124 @@ -3077,25 +3098,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
127 iocb->private = NULL;
129 - /* if not aio dio with unwritten extents, just free io and return */
130 - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
131 - ext4_free_io_end(io_end);
133 - inode_dio_done(inode);
135 - aio_complete(iocb, ret, 0);
139 io_end->offset = offset;
143 io_end->result = ret;
146 - ext4_add_complete_io(io_end);
147 + ext4_put_io_end_defer(io_end);
151 @@ -3129,6 +3138,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
152 get_block_t *get_block_func = NULL;
154 loff_t final_size = offset + count;
155 + ext4_io_end_t *io_end = NULL;
157 /* Use the old path for reads and writes beyond i_size. */
158 if (rw != WRITE || final_size > inode->i_size)
159 @@ -3167,13 +3177,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
160 iocb->private = NULL;
161 ext4_inode_aio_set(inode, NULL);
162 if (!is_sync_kiocb(iocb)) {
163 - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
164 + io_end = ext4_init_io_end(inode, GFP_NOFS);
169 io_end->flag |= EXT4_IO_END_DIRECT;
170 - iocb->private = io_end;
172 + * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
174 + iocb->private = ext4_get_io_end(io_end);
176 * we save the io structure for current async direct
177 * IO, so that later ext4_map_blocks() could flag the
178 @@ -3197,26 +3210,35 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
183 - ext4_inode_aio_set(inode, NULL);
185 - * The io_end structure takes a reference to the inode, that
186 - * structure needs to be destroyed and the reference to the
187 - * inode need to be dropped, when IO is complete, even with 0
188 - * byte write, or failed.
190 - * In the successful AIO DIO case, the io_end structure will
191 - * be destroyed and the reference to the inode will be dropped
192 - * after the end_io call back function is called.
194 - * In the case there is 0 byte write, or error case, since VFS
195 - * direct IO won't invoke the end_io call back function, we
196 - * need to free the end_io structure here.
197 + * Put our reference to io_end. This can free the io_end structure e.g.
198 + * in sync IO case or in case of error. It can even perform extent
199 + * conversion if all bios we submitted finished before we got here.
200 + * Note that in that case iocb->private can be already set to NULL
203 - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
204 - ext4_free_io_end(iocb->private);
205 - iocb->private = NULL;
206 - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
208 + ext4_inode_aio_set(inode, NULL);
209 + ext4_put_io_end(io_end);
211 + * When no IO was submitted ext4_end_io_dio() was not
212 + * called so we have to put iocb's reference.
214 + if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
215 + WARN_ON(iocb->private != io_end);
216 + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
217 + WARN_ON(io_end->iocb);
219 + * Generic code already did inode_dio_done() so we
220 + * have to clear EXT4_IO_END_DIRECT to not do it for
224 + ext4_put_io_end(io_end);
225 + iocb->private = NULL;
228 + if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
229 EXT4_STATE_DIO_UNWRITTEN)) {
232 diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
233 index 4acf1f7..19599bd 100644
234 --- a/fs/ext4/page-io.c
235 +++ b/fs/ext4/page-io.c
236 @@ -62,15 +62,28 @@ void ext4_ioend_shutdown(struct inode *inode)
237 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work);
240 -void ext4_free_io_end(ext4_io_end_t *io)
241 +static void ext4_release_io_end(ext4_io_end_t *io_end)
244 - BUG_ON(!list_empty(&io->list));
245 - BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
246 + BUG_ON(!list_empty(&io_end->list));
247 + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
249 + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
250 + wake_up_all(ext4_ioend_wq(io_end->inode));
251 + if (io_end->flag & EXT4_IO_END_DIRECT)
252 + inode_dio_done(io_end->inode);
254 + aio_complete(io_end->iocb, io_end->result, 0);
255 + kmem_cache_free(io_end_cachep, io_end);
258 +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
260 + struct inode *inode = io_end->inode;
262 - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
263 - wake_up_all(ext4_ioend_wq(io->inode));
264 - kmem_cache_free(io_end_cachep, io);
265 + io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
266 + /* Wake up anyone waiting on unwritten extent conversion */
267 + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
268 + wake_up_all(ext4_ioend_wq(inode));
271 /* check a range of space and convert unwritten extents to written. */
272 @@ -93,13 +106,8 @@ static int ext4_end_io(ext4_io_end_t *io)
273 "(inode %lu, offset %llu, size %zd, error %d)",
274 inode->i_ino, offset, size, ret);
276 - /* Wake up anyone waiting on unwritten extent conversion */
277 - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
278 - wake_up_all(ext4_ioend_wq(inode));
279 - if (io->flag & EXT4_IO_END_DIRECT)
280 - inode_dio_done(inode);
282 - aio_complete(io->iocb, io->result, 0);
283 + ext4_clear_io_unwritten_flag(io);
284 + ext4_release_io_end(io);
288 @@ -130,7 +138,7 @@ static void dump_completed_IO(struct inode *inode)
291 /* Add the io_end to per-inode completed end_io list. */
292 -void ext4_add_complete_io(ext4_io_end_t *io_end)
293 +static void ext4_add_complete_io(ext4_io_end_t *io_end)
295 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
296 struct workqueue_struct *wq;
297 @@ -167,8 +175,6 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
298 err = ext4_end_io(io);
299 if (unlikely(!ret && err))
301 - io->flag &= ~EXT4_IO_END_UNWRITTEN;
302 - ext4_free_io_end(io);
306 @@ -200,10 +206,43 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
307 atomic_inc(&EXT4_I(inode)->i_ioend_count);
309 INIT_LIST_HEAD(&io->list);
310 + atomic_set(&io->count, 1);
315 +void ext4_put_io_end_defer(ext4_io_end_t *io_end)
317 + if (atomic_dec_and_test(&io_end->count)) {
318 + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
319 + ext4_release_io_end(io_end);
322 + ext4_add_complete_io(io_end);
326 +int ext4_put_io_end(ext4_io_end_t *io_end)
330 + if (atomic_dec_and_test(&io_end->count)) {
331 + if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
332 + err = ext4_convert_unwritten_extents(io_end->inode,
333 + io_end->offset, io_end->size);
334 + ext4_clear_io_unwritten_flag(io_end);
336 + ext4_release_io_end(io_end);
341 +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
343 + atomic_inc(&io_end->count);
348 * Print an buffer I/O error compatible with the fs/buffer.c. This
349 * provides compatibility with dmesg scrapers that look for a specific
350 @@ -286,12 +325,7 @@ static void ext4_end_bio(struct bio *bio, int error)
351 bi_sector >> (inode->i_blkbits - 9));
354 - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
355 - ext4_free_io_end(io_end);
359 - ext4_add_complete_io(io_end);
360 + ext4_put_io_end_defer(io_end);
363 void ext4_io_submit(struct ext4_io_submit *io)
364 @@ -305,40 +339,37 @@ void ext4_io_submit(struct ext4_io_submit *io)
371 +void ext4_io_submit_init(struct ext4_io_submit *io,
372 + struct writeback_control *wbc)
374 + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
379 -static int io_submit_init(struct ext4_io_submit *io,
380 - struct inode *inode,
381 - struct writeback_control *wbc,
382 - struct buffer_head *bh)
383 +static int io_submit_init_bio(struct ext4_io_submit *io,
384 + struct buffer_head *bh)
386 - ext4_io_end_t *io_end;
387 - struct page *page = bh->b_page;
388 int nvecs = bio_get_nr_vecs(bh->b_bdev);
391 - io_end = ext4_init_io_end(inode, GFP_NOFS);
394 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
395 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
396 bio->bi_bdev = bh->b_bdev;
397 - bio->bi_private = io->io_end = io_end;
398 bio->bi_end_io = ext4_end_bio;
400 - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
402 + bio->bi_private = ext4_get_io_end(io->io_end);
403 + if (!io->io_end->size)
404 + io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
407 - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
408 io->io_next_block = bh->b_blocknr;
412 static int io_submit_add_bh(struct ext4_io_submit *io,
414 - struct writeback_control *wbc,
415 struct buffer_head *bh)
417 ext4_io_end_t *io_end;
418 @@ -349,18 +380,18 @@ submit_and_retry:
421 if (io->io_bio == NULL) {
422 - ret = io_submit_init(io, inode, wbc, bh);
423 + ret = io_submit_init_bio(io, bh);
427 + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
428 + if (ret != bh->b_size)
429 + goto submit_and_retry;
431 if (test_clear_buffer_uninit(bh))
432 ext4_set_io_unwritten_flag(inode, io_end);
433 - io->io_end->size += bh->b_size;
434 + io_end->size += bh->b_size;
436 - ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
437 - if (ret != bh->b_size)
438 - goto submit_and_retry;
442 @@ -432,7 +463,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
444 if (!buffer_async_write(bh))
446 - ret = io_submit_add_bh(io, inode, wbc, bh);
447 + ret = io_submit_add_bh(io, inode, bh);
450 * We only get here on ENOMEM. Not much else