add patch call-get_blocks-with-create-1-for-write-faults-to-unwritten-extents
[ext4-patch-queue.git] / refactor-direct-IO-code
blob138e34fb349708f233754b053842836f216b542e
1 ext4: refactor direct IO code
3 From: Jan Kara <jack@suse.cz>
5 Currently ext4 direct IO handling is split between ext4_ext_direct_IO()
6 and ext4_ind_direct_IO(). However the extent based function calls into
7 the indirect based one for some cases and for example it is not able to
8 handle file extending. Previously it also was not properly handling
9 retries in case of ENOSPC errors. With DAX things would get even more
10 contrived, so just refactor the direct IO code and, instead of the indirect /
11 extent split, split the handling into reads vs writes.
13 Signed-off-by: Jan Kara <jack@suse.cz>
14 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
15 ---
16  fs/ext4/ext4.h     |   2 -
17  fs/ext4/indirect.c | 127 ---------------------------------------------------
18  fs/ext4/inode.c    | 131 ++++++++++++++++++++++++++++++++++++++++++++++-------
19  3 files changed, 114 insertions(+), 146 deletions(-)
21 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
22 index 349afebe21ee..35792b430fb6 100644
23 --- a/fs/ext4/ext4.h
24 +++ b/fs/ext4/ext4.h
25 @@ -2581,8 +2581,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
26  /* indirect.c */
27  extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
28                                 struct ext4_map_blocks *map, int flags);
29 -extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
30 -                                 loff_t offset);
31  extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
32  extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
33  extern void ext4_ind_truncate(handle_t *, struct inode *inode);
34 diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
35 index 3027fa681de5..bc15c2c17633 100644
36 --- a/fs/ext4/indirect.c
37 +++ b/fs/ext4/indirect.c
38 @@ -649,133 +649,6 @@ out:
39  }
41  /*
42 - * O_DIRECT for ext3 (or indirect map) based files
43 - *
44 - * If the O_DIRECT write will extend the file then add this inode to the
45 - * orphan list.  So recovery will truncate it back to the original size
46 - * if the machine crashes during the write.
47 - *
48 - * If the O_DIRECT write is intantiating holes inside i_size and the machine
49 - * crashes then stale disk data _may_ be exposed inside the file. But current
50 - * VFS code falls back into buffered path in that case so we are safe.
51 - */
52 -ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
53 -                          loff_t offset)
55 -       struct file *file = iocb->ki_filp;
56 -       struct inode *inode = file->f_mapping->host;
57 -       struct ext4_inode_info *ei = EXT4_I(inode);
58 -       handle_t *handle;
59 -       ssize_t ret;
60 -       int orphan = 0;
61 -       size_t count = iov_iter_count(iter);
62 -       int retries = 0;
64 -       if (iov_iter_rw(iter) == WRITE) {
65 -               loff_t final_size = offset + count;
67 -               if (final_size > inode->i_size) {
68 -                       /* Credits for sb + inode write */
69 -                       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
70 -                       if (IS_ERR(handle)) {
71 -                               ret = PTR_ERR(handle);
72 -                               goto out;
73 -                       }
74 -                       ret = ext4_orphan_add(handle, inode);
75 -                       if (ret) {
76 -                               ext4_journal_stop(handle);
77 -                               goto out;
78 -                       }
79 -                       orphan = 1;
80 -                       ei->i_disksize = inode->i_size;
81 -                       ext4_journal_stop(handle);
82 -               }
83 -       }
85 -retry:
86 -       if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
87 -               /*
88 -                * Nolock dioread optimization may be dynamically disabled
89 -                * via ext4_inode_block_unlocked_dio(). Check inode's state
90 -                * while holding extra i_dio_count ref.
91 -                */
92 -               inode_dio_begin(inode);
93 -               smp_mb();
94 -               if (unlikely(ext4_test_inode_state(inode,
95 -                                                   EXT4_STATE_DIOREAD_LOCK))) {
96 -                       inode_dio_end(inode);
97 -                       goto locked;
98 -               }
99 -               if (IS_DAX(inode))
100 -                       ret = dax_do_io(iocb, inode, iter, offset,
101 -                                       ext4_dio_get_block, NULL, 0);
102 -               else
103 -                       ret = __blockdev_direct_IO(iocb, inode,
104 -                                                  inode->i_sb->s_bdev, iter,
105 -                                                  offset, ext4_dio_get_block,
106 -                                                  NULL, NULL, 0);
107 -               inode_dio_end(inode);
108 -       } else {
109 -locked:
110 -               if (IS_DAX(inode))
111 -                       ret = dax_do_io(iocb, inode, iter, offset,
112 -                                       ext4_dio_get_block, NULL, DIO_LOCKING);
113 -               else
114 -                       ret = blockdev_direct_IO(iocb, inode, iter, offset,
115 -                                                ext4_dio_get_block);
117 -               if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
118 -                       loff_t isize = i_size_read(inode);
119 -                       loff_t end = offset + count;
121 -                       if (end > isize)
122 -                               ext4_truncate_failed_write(inode);
123 -               }
124 -       }
125 -       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
126 -               goto retry;
128 -       if (orphan) {
129 -               int err;
131 -               /* Credits for sb + inode write */
132 -               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
133 -               if (IS_ERR(handle)) {
134 -                       /* This is really bad luck. We've written the data
135 -                        * but cannot extend i_size. Bail out and pretend
136 -                        * the write failed... */
137 -                       ret = PTR_ERR(handle);
138 -                       if (inode->i_nlink)
139 -                               ext4_orphan_del(NULL, inode);
141 -                       goto out;
142 -               }
143 -               if (inode->i_nlink)
144 -                       ext4_orphan_del(handle, inode);
145 -               if (ret > 0) {
146 -                       loff_t end = offset + ret;
147 -                       if (end > inode->i_size) {
148 -                               ei->i_disksize = end;
149 -                               i_size_write(inode, end);
150 -                               /*
151 -                                * We're going to return a positive `ret'
152 -                                * here due to non-zero-length I/O, so there's
153 -                                * no way of reporting error returns from
154 -                                * ext4_mark_inode_dirty() to userspace.  So
155 -                                * ignore it.
156 -                                */
157 -                               ext4_mark_inode_dirty(handle, inode);
158 -                       }
159 -               }
160 -               err = ext4_journal_stop(handle);
161 -               if (ret == 0)
162 -                       ret = err;
163 -       }
164 -out:
165 -       return ret;
169   * Calculate the number of metadata blocks need to reserve
170   * to allocate a new block at @lblocks for non extent file based file
171   */
172 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
173 index 3e0c06028668..23fd0e0a9223 100644
174 --- a/fs/ext4/inode.c
175 +++ b/fs/ext4/inode.c
176 @@ -3281,7 +3281,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
179  /*
180 - * For ext4 extent files, ext4 will do direct-io write to holes,
181 + * Handling of direct IO writes.
182 + *
183 + * For ext4 extent files, ext4 will do direct-io write even to holes,
184   * preallocated extents, and those write extend the file, no need to
185   * fall back to buffered IO.
186   *
187 @@ -3299,21 +3301,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
188   * if the machine crashes during the write.
189   *
190   */
191 -static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
192 -                                 loff_t offset)
193 +static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter,
194 +                                   loff_t offset)
196         struct file *file = iocb->ki_filp;
197         struct inode *inode = file->f_mapping->host;
198 +       struct ext4_inode_info *ei = EXT4_I(inode);
199         ssize_t ret;
200         size_t count = iov_iter_count(iter);
201         int overwrite = 0;
202         get_block_t *get_block_func = NULL;
203         int dio_flags = 0;
204         loff_t final_size = offset + count;
205 +       int orphan = 0;
206 +       handle_t *handle;
208 -       /* Use the old path for reads and writes beyond i_size. */
209 -       if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
210 -               return ext4_ind_direct_IO(iocb, iter, offset);
211 +       if (final_size > inode->i_size) {
212 +               /* Credits for sb + inode write */
213 +               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
214 +               if (IS_ERR(handle)) {
215 +                       ret = PTR_ERR(handle);
216 +                       goto out;
217 +               }
218 +               ret = ext4_orphan_add(handle, inode);
219 +               if (ret) {
220 +                       ext4_journal_stop(handle);
221 +                       goto out;
222 +               }
223 +               orphan = 1;
224 +               ei->i_disksize = inode->i_size;
225 +               ext4_journal_stop(handle);
226 +       }
228         BUG_ON(iocb->private == NULL);
230 @@ -3322,8 +3340,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
231          * conversion. This also disallows race between truncate() and
232          * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
233          */
234 -       if (iov_iter_rw(iter) == WRITE)
235 -               inode_dio_begin(inode);
236 +       inode_dio_begin(inode);
238         /* If we do a overwrite dio, i_mutex locking can be released */
239         overwrite = *((int *)iocb->private);
240 @@ -3332,7 +3349,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
241                 inode_unlock(inode);
243         /*
244 -        * We could direct write to holes and fallocate.
245 +        * For extent mapped files we could direct write to holes and fallocate.
246          *
247          * Allocated blocks to fill the hole are marked as unwritten to prevent
248          * parallel buffered read to expose the stale data before DIO complete
249 @@ -3354,7 +3371,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
250         iocb->private = NULL;
251         if (overwrite)
252                 get_block_func = ext4_dio_get_block_overwrite;
253 -       else if (is_sync_kiocb(iocb)) {
254 +       else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
255 +                round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
256 +               get_block_func = ext4_dio_get_block;
257 +               dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
258 +       } else if (is_sync_kiocb(iocb)) {
259                 get_block_func = ext4_dio_get_block_unwritten_sync;
260                 dio_flags = DIO_LOCKING;
261         } else {
262 @@ -3364,10 +3385,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
263  #ifdef CONFIG_EXT4_FS_ENCRYPTION
264         BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
265  #endif
266 -       if (IS_DAX(inode))
267 +       if (IS_DAX(inode)) {
268 +               dio_flags &= ~DIO_SKIP_HOLES;
269                 ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
270                                 ext4_end_io_dio, dio_flags);
271 -       else
272 +       } else
273                 ret = __blockdev_direct_IO(iocb, inode,
274                                            inode->i_sb->s_bdev, iter, offset,
275                                            get_block_func,
276 @@ -3387,12 +3409,87 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
277                 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
278         }
280 -       if (iov_iter_rw(iter) == WRITE)
281 -               inode_dio_end(inode);
282 +       inode_dio_end(inode);
283         /* take i_mutex locking again if we do a ovewrite dio */
284         if (overwrite)
285                 inode_lock(inode);
287 +       if (ret < 0 && final_size > inode->i_size)
288 +               ext4_truncate_failed_write(inode);
290 +       /* Handle extending of i_size after direct IO write */
291 +       if (orphan) {
292 +               int err;
294 +               /* Credits for sb + inode write */
295 +               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
296 +               if (IS_ERR(handle)) {
297 +                       /* This is really bad luck. We've written the data
298 +                        * but cannot extend i_size. Bail out and pretend
299 +                        * the write failed... */
300 +                       ret = PTR_ERR(handle);
301 +                       if (inode->i_nlink)
302 +                               ext4_orphan_del(NULL, inode);
304 +                       goto out;
305 +               }
306 +               if (inode->i_nlink)
307 +                       ext4_orphan_del(handle, inode);
308 +               if (ret > 0) {
309 +                       loff_t end = offset + ret;
310 +                       if (end > inode->i_size) {
311 +                               ei->i_disksize = end;
312 +                               i_size_write(inode, end);
313 +                               /*
314 +                                * We're going to return a positive `ret'
315 +                                * here due to non-zero-length I/O, so there's
316 +                                * no way of reporting error returns from
317 +                                * ext4_mark_inode_dirty() to userspace.  So
318 +                                * ignore it.
319 +                                */
320 +                               ext4_mark_inode_dirty(handle, inode);
321 +                       }
322 +               }
323 +               err = ext4_journal_stop(handle);
324 +               if (ret == 0)
325 +                       ret = err;
326 +       }
327 +out:
328 +       return ret;
331 +static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter,
332 +                                  loff_t offset)
334 +       int unlocked = 0;
335 +       struct inode *inode = iocb->ki_filp->f_mapping->host;
336 +       ssize_t ret;
338 +       if (ext4_should_dioread_nolock(inode)) {
339 +               /*
340 +                * Nolock dioread optimization may be dynamically disabled
341 +                * via ext4_inode_block_unlocked_dio(). Check inode's state
342 +                * while holding extra i_dio_count ref.
343 +                */
344 +               inode_dio_begin(inode);
345 +               smp_mb();
346 +               if (unlikely(ext4_test_inode_state(inode,
347 +                                                   EXT4_STATE_DIOREAD_LOCK)))
348 +                       inode_dio_end(inode);
349 +               else
350 +                       unlocked = 1;
351 +       }
352 +       if (IS_DAX(inode)) {
353 +               ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block,
354 +                               NULL, unlocked ? 0 : DIO_LOCKING);
355 +       } else {
356 +               ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
357 +                                          iter, offset, ext4_dio_get_block,
358 +                                          NULL, NULL,
359 +                                          unlocked ? 0 : DIO_LOCKING);
360 +       }
361 +       if (unlocked)
362 +               inode_dio_end(inode);
363         return ret;
366 @@ -3420,10 +3517,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
367                 return 0;
369         trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
370 -       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
371 -               ret = ext4_ext_direct_IO(iocb, iter, offset);
372 +       if (iov_iter_rw(iter) == READ)
373 +               ret = ext4_direct_IO_read(iocb, iter, offset);
374         else
375 -               ret = ext4_ind_direct_IO(iocb, iter, offset);
376 +               ret = ext4_direct_IO_write(iocb, iter, offset);
377         trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
378         return ret;
380 -- 
381 2.6.6