1 ext4: refactor direct IO code
3 From: Jan Kara <jack@suse.cz>
5 Currently ext4 direct IO handling is split between ext4_ext_direct_IO()
6 and ext4_ind_direct_IO(). However the extent based function calls into
7 the indirect based one for some cases and for example it is not able to
8 handle file extending. Previously it also did not properly handle
9 retries in case of ENOSPC errors. With DAX things would get even more
10 contrived, so just refactor the direct IO code and, instead of the
11 indirect / extent split, split it into reads vs writes.
13 Signed-off-by: Jan Kara <jack@suse.cz>
14 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
17 fs/ext4/indirect.c | 127 ---------------------------------------------------
18 fs/ext4/inode.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++-------
19 3 files changed, 114 insertions(+), 146 deletions(-)
21 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
22 index 349afebe21ee..35792b430fb6 100644
25 @@ -2581,8 +2581,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
27 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
28 struct ext4_map_blocks *map, int flags);
29 -extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
31 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
32 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
33 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
34 diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
35 index 3027fa681de5..bc15c2c17633 100644
36 --- a/fs/ext4/indirect.c
37 +++ b/fs/ext4/indirect.c
38 @@ -649,133 +649,6 @@ out:
42 - * O_DIRECT for ext3 (or indirect map) based files
44 - * If the O_DIRECT write will extend the file then add this inode to the
45 - * orphan list. So recovery will truncate it back to the original size
46 - * if the machine crashes during the write.
48 - * If the O_DIRECT write is intantiating holes inside i_size and the machine
49 - * crashes then stale disk data _may_ be exposed inside the file. But current
50 - * VFS code falls back into buffered path in that case so we are safe.
52 -ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
55 - struct file *file = iocb->ki_filp;
56 - struct inode *inode = file->f_mapping->host;
57 - struct ext4_inode_info *ei = EXT4_I(inode);
61 - size_t count = iov_iter_count(iter);
64 - if (iov_iter_rw(iter) == WRITE) {
65 - loff_t final_size = offset + count;
67 - if (final_size > inode->i_size) {
68 - /* Credits for sb + inode write */
69 - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
70 - if (IS_ERR(handle)) {
71 - ret = PTR_ERR(handle);
74 - ret = ext4_orphan_add(handle, inode);
76 - ext4_journal_stop(handle);
80 - ei->i_disksize = inode->i_size;
81 - ext4_journal_stop(handle);
86 - if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
88 - * Nolock dioread optimization may be dynamically disabled
89 - * via ext4_inode_block_unlocked_dio(). Check inode's state
90 - * while holding extra i_dio_count ref.
92 - inode_dio_begin(inode);
94 - if (unlikely(ext4_test_inode_state(inode,
95 - EXT4_STATE_DIOREAD_LOCK))) {
96 - inode_dio_end(inode);
100 - ret = dax_do_io(iocb, inode, iter, offset,
101 - ext4_dio_get_block, NULL, 0);
103 - ret = __blockdev_direct_IO(iocb, inode,
104 - inode->i_sb->s_bdev, iter,
105 - offset, ext4_dio_get_block,
107 - inode_dio_end(inode);
111 - ret = dax_do_io(iocb, inode, iter, offset,
112 - ext4_dio_get_block, NULL, DIO_LOCKING);
114 - ret = blockdev_direct_IO(iocb, inode, iter, offset,
115 - ext4_dio_get_block);
117 - if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
118 - loff_t isize = i_size_read(inode);
119 - loff_t end = offset + count;
122 - ext4_truncate_failed_write(inode);
125 - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
131 - /* Credits for sb + inode write */
132 - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
133 - if (IS_ERR(handle)) {
134 - /* This is really bad luck. We've written the data
135 - * but cannot extend i_size. Bail out and pretend
136 - * the write failed... */
137 - ret = PTR_ERR(handle);
138 - if (inode->i_nlink)
139 - ext4_orphan_del(NULL, inode);
143 - if (inode->i_nlink)
144 - ext4_orphan_del(handle, inode);
146 - loff_t end = offset + ret;
147 - if (end > inode->i_size) {
148 - ei->i_disksize = end;
149 - i_size_write(inode, end);
151 - * We're going to return a positive `ret'
152 - * here due to non-zero-length I/O, so there's
153 - * no way of reporting error returns from
154 - * ext4_mark_inode_dirty() to userspace. So
157 - ext4_mark_inode_dirty(handle, inode);
160 - err = ext4_journal_stop(handle);
169 * Calculate the number of metadata blocks need to reserve
170 * to allocate a new block at @lblocks for non extent file based file
172 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
173 index 3e0c06028668..23fd0e0a9223 100644
174 --- a/fs/ext4/inode.c
175 +++ b/fs/ext4/inode.c
176 @@ -3281,7 +3281,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
180 - * For ext4 extent files, ext4 will do direct-io write to holes,
181 + * Handling of direct IO writes.
183 + * For ext4 extent files, ext4 will do direct-io write even to holes,
184 * preallocated extents, and those write extend the file, no need to
185 * fall back to buffered IO.
187 @@ -3299,21 +3301,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
188 * if the machine crashes during the write.
191 -static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
193 +static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
196 struct file *file = iocb->ki_filp;
197 struct inode *inode = file->f_mapping->host;
198 + struct ext4_inode_info *ei = EXT4_I(inode);
200 size_t count = iov_iter_count(iter);
202 get_block_t *get_block_func = NULL;
204 loff_t final_size = offset + count;
208 - /* Use the old path for reads and writes beyond i_size. */
209 - if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
210 - return ext4_ind_direct_IO(iocb, iter, offset);
211 + if (final_size > inode->i_size) {
212 + /* Credits for sb + inode write */
213 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
214 + if (IS_ERR(handle)) {
215 + ret = PTR_ERR(handle);
218 + ret = ext4_orphan_add(handle, inode);
220 + ext4_journal_stop(handle);
224 + ei->i_disksize = inode->i_size;
225 + ext4_journal_stop(handle);
228 BUG_ON(iocb->private == NULL);
230 @@ -3322,8 +3340,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
231 * conversion. This also disallows race between truncate() and
232 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
234 - if (iov_iter_rw(iter) == WRITE)
235 - inode_dio_begin(inode);
236 + inode_dio_begin(inode);
238 /* If we do a overwrite dio, i_mutex locking can be released */
239 overwrite = *((int *)iocb->private);
240 @@ -3332,7 +3349,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
244 - * We could direct write to holes and fallocate.
245 + * For extent mapped files we could direct write to holes and fallocate.
247 * Allocated blocks to fill the hole are marked as unwritten to prevent
248 * parallel buffered read to expose the stale data before DIO complete
249 @@ -3354,7 +3371,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
250 iocb->private = NULL;
252 get_block_func = ext4_dio_get_block_overwrite;
253 - else if (is_sync_kiocb(iocb)) {
254 + else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
255 + round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
256 + get_block_func = ext4_dio_get_block;
257 + dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
258 + } else if (is_sync_kiocb(iocb)) {
259 get_block_func = ext4_dio_get_block_unwritten_sync;
260 dio_flags = DIO_LOCKING;
262 @@ -3364,10 +3385,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
263 #ifdef CONFIG_EXT4_FS_ENCRYPTION
264 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
267 + if (IS_DAX(inode)) {
268 + dio_flags &= ~DIO_SKIP_HOLES;
269 ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
270 ext4_end_io_dio, dio_flags);
273 ret = __blockdev_direct_IO(iocb, inode,
274 inode->i_sb->s_bdev, iter, offset,
276 @@ -3387,12 +3409,87 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
277 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
280 - if (iov_iter_rw(iter) == WRITE)
281 - inode_dio_end(inode);
282 + inode_dio_end(inode);
283 /* take i_mutex locking again if we do a ovewrite dio */
287 + if (ret < 0 && final_size > inode->i_size)
288 + ext4_truncate_failed_write(inode);
290 + /* Handle extending of i_size after direct IO write */
294 + /* Credits for sb + inode write */
295 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
296 + if (IS_ERR(handle)) {
297 + /* This is really bad luck. We've written the data
298 + * but cannot extend i_size. Bail out and pretend
299 + * the write failed... */
300 + ret = PTR_ERR(handle);
301 + if (inode->i_nlink)
302 + ext4_orphan_del(NULL, inode);
306 + if (inode->i_nlink)
307 + ext4_orphan_del(handle, inode);
309 + loff_t end = offset + ret;
310 + if (end > inode->i_size) {
311 + ei->i_disksize = end;
312 + i_size_write(inode, end);
314 + * We're going to return a positive `ret'
315 + * here due to non-zero-length I/O, so there's
316 + * no way of reporting error returns from
317 + * ext4_mark_inode_dirty() to userspace. So
320 + ext4_mark_inode_dirty(handle, inode);
323 + err = ext4_journal_stop(handle);
331 +static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter,
335 + struct inode *inode = iocb->ki_filp->f_mapping->host;
338 + if (ext4_should_dioread_nolock(inode)) {
340 + * Nolock dioread optimization may be dynamically disabled
341 + * via ext4_inode_block_unlocked_dio(). Check inode's state
342 + * while holding extra i_dio_count ref.
344 + inode_dio_begin(inode);
346 + if (unlikely(ext4_test_inode_state(inode,
347 + EXT4_STATE_DIOREAD_LOCK)))
348 + inode_dio_end(inode);
352 + if (IS_DAX(inode)) {
353 + ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block,
354 + NULL, unlocked ? 0 : DIO_LOCKING);
356 + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
357 + iter, offset, ext4_dio_get_block,
359 + unlocked ? 0 : DIO_LOCKING);
362 + inode_dio_end(inode);
366 @@ -3420,10 +3517,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
369 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
370 - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
371 - ret = ext4_ext_direct_IO(iocb, iter, offset);
372 + if (iov_iter_rw(iter) == READ)
373 + ret = ext4_direct_IO_read(iocb, iter, offset);
375 - ret = ext4_ind_direct_IO(iocb, iter, offset);
376 + ret = ext4_direct_IO_write(iocb, iter, offset);
377 trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);