1 ext4: simplify io_end handling for AIO DIO
3 From: Jan Kara <jack@suse.cz>
5 When mapping blocks for direct IO, we allocate io_end structure before
6 mapping blocks and store pointer to it in the inode. This creates a
7 requirement that any AIO DIO using io_end must be protected by i_mutex.
8 This created problems in the past with dioread_nolock mode which was
9 corrupting io_end pointers. Also io_end is allocated unnecessarily in
10 case where we don't need to convert any extents (which is a common case
11 for example when overwriting file).
13 We fix the problem by allocating io_end only once we return unwritten
14 extent from block mapping function for AIO DIO (so we can save some
15 pointless io_end allocations) and we pass pointer to it in bh->b_private
16 which generic DIO code later passes to our end IO callback. That way we
17 remove any need for global pointer to io_end structure and thus fix the
20 The downside of this change is that the checking for unwritten IO in
21 flight in ext4_extents_can_be_merged() is more racy since we now
22 increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after dropping
23 i_data_sem. However the check has been racy already before because
24 ext4_writepages() already increment i_unwritten after dropping
25 i_data_sem and reserved blocks save us from hitting ENOSPC in the worst
28 Signed-off-by: Jan Kara <jack@suse.cz>
30 fs/ext4/ext4.h | 10 ----
31 fs/ext4/extents.c | 35 +++-----------
32 fs/ext4/inode.c | 133 ++++++++++++++++++++++++++++--------------------------
33 3 files changed, 74 insertions(+), 104 deletions(-)
35 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
36 index 80cc3914b6ad..29d51505a9d2 100644
39 @@ -1513,16 +1513,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
43 -static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
45 - return inode->i_private;
48 -static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
50 - inode->i_private = io;
54 * Inode dynamic state flags
56 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
57 index 0ffabaf90aa5..1742e60e229c 100644
58 --- a/fs/ext4/extents.c
59 +++ b/fs/ext4/extents.c
60 @@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
62 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
65 + * The check for IO to unwritten extent is somewhat racy as we
66 + * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
67 + * dropping i_data_sem. But reserved blocks should save us in that
70 if (ext4_ext_is_unwritten(ex1) &&
71 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
72 atomic_read(&EXT4_I(inode)->i_unwritten) ||
73 @@ -4007,7 +4013,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
74 struct ext4_ext_path *path = *ppath;
77 - ext4_io_end_t *io = ext4_inode_aio(inode);
79 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
80 "block %llu, max_blocks %u, flags %x, allocated %u\n",
81 @@ -4030,15 +4035,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
82 flags | EXT4_GET_BLOCKS_CONVERT);
86 - * Flag the inode(non aio case) or end_io struct (aio case)
87 - * that this IO needs to conversion to written when IO is
91 - ext4_set_io_unwritten_flag(inode, io);
93 - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
94 map->m_flags |= EXT4_MAP_UNWRITTEN;
97 @@ -4283,9 +4279,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
98 unsigned int allocated = 0, offset = 0;
99 unsigned int allocated_clusters = 0;
100 struct ext4_allocation_request ar;
101 - ext4_io_end_t *io = ext4_inode_aio(inode);
102 ext4_lblk_t cluster_offset;
103 - int set_unwritten = 0;
104 bool map_from_cluster = false;
106 ext_debug("blocks %u/%u requested for inode %lu\n",
107 @@ -4482,15 +4476,6 @@ got_allocated_blocks:
108 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
109 ext4_ext_mark_unwritten(&newex);
110 map->m_flags |= EXT4_MAP_UNWRITTEN;
112 - * io_end structure was created for every IO write to an
113 - * unwritten extent. To avoid unnecessary conversion,
114 - * here we flag the IO that really needs the conversion.
115 - * For non asycn direct IO case, flag the inode state
116 - * that we need to perform conversion when IO is done.
118 - if (flags & EXT4_GET_BLOCKS_PRE_IO)
123 @@ -4501,14 +4486,6 @@ got_allocated_blocks:
124 err = ext4_ext_insert_extent(handle, inode, &path,
127 - if (!err && set_unwritten) {
129 - ext4_set_io_unwritten_flag(inode, io);
131 - ext4_set_inode_state(inode,
132 - EXT4_STATE_DIO_UNWRITTEN);
135 if (err && free_on_err) {
136 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
137 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
138 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
139 index 506180729faf..ef411c4f11f8 100644
140 --- a/fs/ext4/inode.c
141 +++ b/fs/ext4/inode.c
142 @@ -768,18 +768,16 @@ int ext4_dio_get_block(struct inode *inode, sector_t iblock,
146 - * Get block function for DIO writes when we create unwritten extent if
147 + * Get block function for AIO DIO writes when we create unwritten extent if
148 * blocks are not allocated yet. The extent will be converted to written
149 * after IO is complete.
151 -static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock,
152 - struct buffer_head *bh_result, int create)
153 +static int ext4_dio_get_block_unwritten_async(struct inode *inode,
154 + sector_t iblock, struct buffer_head *bh_result, int create)
159 - ext4_debug("ext4_dio_get_block_unwritten: inode %lu, create flag %d\n",
160 - inode->i_ino, create);
161 /* We don't expect handle for direct IO */
162 WARN_ON_ONCE(ext4_journal_current_handle());
164 @@ -789,16 +787,62 @@ static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock,
165 ret = _ext4_get_block(inode, iblock, bh_result,
166 EXT4_GET_BLOCKS_IO_CREATE_EXT);
167 ext4_journal_stop(handle);
168 - if (!ret && buffer_unwritten(bh_result)) {
169 - ext4_io_end_t *io_end = ext4_inode_aio(inode);
172 + * When doing DIO using unwritten extents, we need io_end to convert
173 + * unwritten extents to written on IO completion. We allocate io_end
174 + * once we spot unwritten extent and store it in b_private. Generic
175 + * DIO code keeps b_private set and furthermore passes the value to
176 + * our completion callback in 'private' argument.
178 + if (!ret && buffer_unwritten(bh_result)) {
179 + if (!bh_result->b_private) {
180 + ext4_io_end_t *io_end;
182 + io_end = ext4_init_io_end(inode, GFP_KERNEL);
185 + bh_result->b_private = io_end;
186 + ext4_set_io_unwritten_flag(inode, io_end);
188 set_buffer_defer_completion(bh_result);
189 - WARN_ON_ONCE(io_end && !(io_end->flag & EXT4_IO_END_UNWRITTEN));
196 + * Get block function for non-AIO DIO writes when we create unwritten extent if
197 + * blocks are not allocated yet. The extent will be converted to written
198 + * after IO is complete from ext4_ext_direct_IO() function.
200 +static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
201 + sector_t iblock, struct buffer_head *bh_result, int create)
206 + /* We don't expect handle for direct IO */
207 + WARN_ON_ONCE(ext4_journal_current_handle());
209 + handle = start_dio_trans(inode, bh_result);
210 + if (IS_ERR(handle))
211 + return PTR_ERR(handle);
212 + ret = _ext4_get_block(inode, iblock, bh_result,
213 + EXT4_GET_BLOCKS_IO_CREATE_EXT);
214 + ext4_journal_stop(handle);
217 + * Mark inode as having pending DIO writes to unwritten extents.
218 + * ext4_ext_direct_IO() checks this flag and converts extents to
221 + if (!ret && buffer_unwritten(bh_result))
222 + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
227 static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
228 struct buffer_head *bh_result, int create)
230 @@ -3214,7 +3258,7 @@ out:
231 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
232 ssize_t size, void *private)
234 - ext4_io_end_t *io_end = iocb->private;
235 + ext4_io_end_t *io_end = private;
237 /* if not async direct IO just return */
239 @@ -3222,10 +3266,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
241 ext_debug("ext4_end_io_dio(): io_end 0x%p "
242 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
243 - iocb->private, io_end->inode->i_ino, iocb, offset,
245 + io_end, io_end->inode->i_ino, iocb, offset, size);
247 - iocb->private = NULL;
248 io_end->offset = offset;
250 ext4_put_io_end(io_end);
251 @@ -3261,7 +3303,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
252 get_block_t *get_block_func = NULL;
254 loff_t final_size = offset + count;
255 - ext4_io_end_t *io_end = NULL;
257 /* Use the old path for reads and writes beyond i_size. */
258 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
259 @@ -3286,16 +3327,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
261 * We could direct write to holes and fallocate.
263 - * Allocated blocks to fill the hole are marked as
264 - * unwritten to prevent parallel buffered read to expose
265 - * the stale data before DIO complete the data IO.
266 + * Allocated blocks to fill the hole are marked as unwritten to prevent
267 + * parallel buffered read to expose the stale data before DIO complete
270 - * As to previously fallocated extents, ext4 get_block will
271 - * just simply mark the buffer mapped but still keep the
272 - * extents unwritten.
273 + * As to previously fallocated extents, ext4 get_block will just simply
274 + * mark the buffer mapped but still keep the extents unwritten.
276 - * For non AIO case, we will convert those unwritten extents
277 - * to written after return back from blockdev_direct_IO.
278 + * For non AIO case, we will convert those unwritten extents to written
279 + * after return back from blockdev_direct_IO. That way we save us from
280 + * allocating io_end structure and also the overhead of offloading
281 + * the extent convertion to a workqueue.
283 * For async DIO, the conversion needs to be deferred when the
284 * IO is completed. The ext4 end_io callback function will be
285 @@ -3303,30 +3345,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
286 * case, we allocate an io_end structure to hook to the iocb.
288 iocb->private = NULL;
291 get_block_func = ext4_dio_get_block_overwrite;
292 + else if (is_sync_kiocb(iocb)) {
293 + get_block_func = ext4_dio_get_block_unwritten_sync;
294 + dio_flags = DIO_LOCKING;
296 - ext4_inode_aio_set(inode, NULL);
297 - if (!is_sync_kiocb(iocb)) {
298 - io_end = ext4_init_io_end(inode, GFP_NOFS);
304 - * Grab reference for DIO. Will be dropped in
305 - * ext4_end_io_dio()
307 - iocb->private = ext4_get_io_end(io_end);
309 - * we save the io structure for current async direct
310 - * IO, so that later ext4_map_blocks() could flag the
311 - * io structure whether there is a unwritten extents
312 - * needs to be converted when IO is completed.
314 - ext4_inode_aio_set(inode, io_end);
316 - get_block_func = ext4_dio_get_block_unwritten;
317 + get_block_func = ext4_dio_get_block_unwritten_async;
318 dio_flags = DIO_LOCKING;
320 #ifdef CONFIG_EXT4_FS_ENCRYPTION
321 @@ -3341,27 +3366,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
323 ext4_end_io_dio, NULL, dio_flags);
326 - * Put our reference to io_end. This can free the io_end structure e.g.
327 - * in sync IO case or in case of error. It can even perform extent
328 - * conversion if all bios we submitted finished before we got here.
329 - * Note that in that case iocb->private can be already set to NULL
333 - ext4_inode_aio_set(inode, NULL);
334 - ext4_put_io_end(io_end);
336 - * When no IO was submitted ext4_end_io_dio() was not
337 - * called so we have to put iocb's reference.
339 - if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
340 - WARN_ON(iocb->private != io_end);
341 - WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
342 - ext4_put_io_end(io_end);
343 - iocb->private = NULL;
346 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
347 EXT4_STATE_DIO_UNWRITTEN)) {
349 @@ -3376,7 +3380,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
350 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
354 if (iov_iter_rw(iter) == WRITE)
355 inode_dio_end(inode);
356 /* take i_mutex locking again if we do a ovewrite dio */
361 To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
362 the body of a message to majordomo@vger.kernel.org
363 More majordomo info at http://vger.kernel.org/majordomo-info.html