ext4: Add support for FALLOC_FL_INSERT_RANGE in fallocate

From: Namjae Jeon <namjae.jeon@samsung.com>

This patch implements fallocate's FALLOC_FL_INSERT_RANGE flag for ext4.
The operation works as follows:

1) Make sure that both offset and len are block size aligned.
2) Update the i_size of the inode by len bytes.
3) Compute the logical block number corresponding to offset. If this block
   is not the starting block of an extent, split the extent so that it
   becomes the starting block of an extent.
4) Shift all the extents lying in the range [offset, last allocated extent]
   to the right by len bytes. This creates a hole of len bytes at offset.
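
For illustration only, below is a minimal userspace sketch of how the new
mode is exercised through the existing fallocate(2) interface. The file
name, offset and length are arbitrary placeholders, and the local
FALLOC_FL_INSERT_RANGE fallback (0x20, matching include/uapi/linux/falloc.h
in this series) is only needed while libc headers do not yet carry the flag:

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  /* Fallback while libc headers lack the flag; value from uapi falloc.h */
  #ifndef FALLOC_FL_INSERT_RANGE
  #define FALLOC_FL_INSERT_RANGE 0x20
  #endif

  int main(void)
  {
          int fd = open("testfile", O_RDWR);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }

          /*
           * Insert a 64KiB hole at offset 4096: everything from offset
           * 4096 onwards is shifted right and i_size grows by 64KiB.
           * Both offset and len must be block size aligned and offset
           * must be smaller than i_size, otherwise EINVAL is returned.
           */
          if (fallocate(fd, FALLOC_FL_INSERT_RANGE, 4096, 65536) < 0)
                  perror("fallocate(FALLOC_FL_INSERT_RANGE)");

          close(fd);
          return 0;
  }

After a successful call the data that previously lived at offset 4096 is
readable at offset 4096 + 65536, and the newly inserted range reads back
as zeroes.
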
Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
---
 fs/ext4/ext4.h              |   6 ++
 fs/ext4/extents.c           | 305 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 include/trace/events/ext4.h |  25 +++++
 3 files changed, 284 insertions(+), 52 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 730c88d..3ab7cd8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -90,6 +90,11 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;

+enum SHIFT_DIRECTION {
+       SHIFT_LEFT = 0,
+       SHIFT_RIGHT,
+};

 /*
  * Flags used in mballoc's allocation_context flags field.
  *
@@ -2947,6 +2952,7 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
 extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
                                struct inode *inode2, ext4_lblk_t lblk1,
                             ext4_lblk_t lblk2,  ext4_lblk_t count,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f38a6d6..da91d49 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4912,12 +4912,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
         * bug we should fix....
         */
        if (ext4_encrypted_inode(inode) &&
-           (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)))
+           (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
+                    FALLOC_FL_ZERO_RANGE)))
                return -EOPNOTSUPP;

        /* Return error if mode is not supported */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-                    FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+                    FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+                    FALLOC_FL_INSERT_RANGE))
                return -EOPNOTSUPP;

        if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4930,6 +4932,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (mode & FALLOC_FL_COLLAPSE_RANGE)
                return ext4_collapse_range(inode, offset, len);

+       if (mode & FALLOC_FL_INSERT_RANGE)
+               return ext4_insert_range(inode, offset, len);

        if (mode & FALLOC_FL_ZERO_RANGE)
                return ext4_zero_range(file, offset, len, mode);
@@ -5224,13 +5229,13 @@ ext4_access_path(handle_t *handle, struct inode *inode,
 /*
  * ext4_ext_shift_path_extents:
  * Shift the extents of a path structure lying between path[depth].p_ext
- * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
- * from starting block for each extent.
+ * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
+ * if it is right shift or left shift operation.
  */
 static int
 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
                            struct inode *inode, handle_t *handle,
-                           ext4_lblk_t *start)
+                           enum SHIFT_DIRECTION SHIFT)
 {
        int depth, err = 0;
        struct ext4_extent *ex_start, *ex_last;
@@ -5252,19 +5257,25 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
                        if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
                                update = 1;

-                       *start = le32_to_cpu(ex_last->ee_block) +
-                               ext4_ext_get_actual_len(ex_last);

                        while (ex_start <= ex_last) {
-                               le32_add_cpu(&ex_start->ee_block, -shift);
-                               /* Try to merge to the left. */
-                               if ((ex_start >
-                                    EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
-                                   ext4_ext_try_to_merge_right(inode,
-                                                       path, ex_start - 1))
+                               if (SHIFT == SHIFT_LEFT) {
+                                       le32_add_cpu(&ex_start->ee_block,
+                                               -shift);
+                                       /* Try to merge to the left. */
+                                       if ((ex_start >
+                                           EXT_FIRST_EXTENT(path[depth].p_hdr))
+                                           &&
+                                           ext4_ext_try_to_merge_right(inode,
+                                           path, ex_start - 1))
+                                               ex_last--;
+                                       else
+                                               ex_start++;
+                               } else {
+                                       le32_add_cpu(&ex_last->ee_block, shift);
+                                       ext4_ext_try_to_merge_right(inode, path,
+                                               ex_last);
                                        ex_last--;
-                               else
-                                       ex_start++;
+                               }
                        }
                        err = ext4_ext_dirty(handle, inode, path + depth);
                        if (err)
@@ -5279,7 +5290,10 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
                if (err)
                        goto out;

-               le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+               if (SHIFT == SHIFT_LEFT)
+                       le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+               else
+                       le32_add_cpu(&path[depth].p_idx->ei_block, shift);
                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto out;
@@ -5297,19 +5311,20 @@ out:

 /*
  * ext4_ext_shift_extents:
- * All the extents which lies in the range from start to the last allocated
- * block for the file are shifted downwards by shift blocks.
+ * All the extents which lies in the range from @start to the last allocated
+ * block for the @inode are shifted either towards left or right (depending
+ * upon @SHIFT) by @shift blocks.
  * On success, 0 is returned, error otherwise.
  */
 static int
 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
-                      ext4_lblk_t start, ext4_lblk_t shift)
+                      ext4_lblk_t start, ext4_lblk_t shift,
+                      enum SHIFT_DIRECTION SHIFT)
 {
        struct ext4_ext_path *path;
        int ret = 0, depth;
        struct ext4_extent *extent;
-       ext4_lblk_t stop_block;
-       ext4_lblk_t ex_start, ex_end;
+       ext4_lblk_t stop, *iterator, ex_start, ex_end;

        /* Let path point to the last extent */
        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
@@ -5321,58 +5336,84 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
        if (!extent)
                goto out;

-       stop_block = le32_to_cpu(extent->ee_block) +
+       stop = le32_to_cpu(extent->ee_block) +
                        ext4_ext_get_actual_len(extent);

-       /* Nothing to shift, if hole is at the end of file */
-       if (start >= stop_block)
-               goto out;
+       /*
+        * In case of left shift, Don't start shifting extents until we make
+        * sure the hole is big enough to accommodate the shift.
+       */
+       if (SHIFT == SHIFT_LEFT) {
+               path = ext4_find_extent(inode, start - 1, &path, 0);
+               if (IS_ERR(path))
+                       return PTR_ERR(path);
+               depth = path->p_depth;
+               extent =  path[depth].p_ext;
+               if (extent) {
+                       ex_start = le32_to_cpu(extent->ee_block);
+                       ex_end = le32_to_cpu(extent->ee_block) +
+                               ext4_ext_get_actual_len(extent);
+               } else {
+                       ex_start = 0;
+                       ex_end = 0;
+               }

-       /*
-        * Don't start shifting extents until we make sure the hole is big
-        * enough to accomodate the shift.
-        */
-       path = ext4_find_extent(inode, start - 1, &path, 0);
-       if (IS_ERR(path))
-               return PTR_ERR(path);
-       depth = path->p_depth;
-       extent =  path[depth].p_ext;
-       if (extent) {
-               ex_start = le32_to_cpu(extent->ee_block);
-               ex_end = le32_to_cpu(extent->ee_block) +
-                       ext4_ext_get_actual_len(extent);
-       } else {
-               ex_start = 0;
-               ex_end = 0;
+               if ((start == ex_start && shift > ex_start) ||
+                   (shift > start - ex_end)) {
+                       ext4_ext_drop_refs(path);
+                       kfree(path);
+                       return -EINVAL;
+               }
        }
-       if ((start == ex_start && shift > ex_start) ||
-           (shift > start - ex_end))
-               return -EINVAL;
+       /*
+        * In case of left shift, iterator points to start and it is increased
+        * till we reach stop. In case of right shift, iterator points to stop
+        * and it is decreased till we reach start.
+        */
+       if (SHIFT == SHIFT_LEFT)
+               iterator = &start;
+       else
+               iterator = &stop;

        /* Its safe to start updating extents */
-       while (start < stop_block) {
-               path = ext4_find_extent(inode, start, &path, 0);
+       while (start < stop) {
+               path = ext4_find_extent(inode, *iterator, &path, 0);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = path->p_depth;
                extent = path[depth].p_ext;
                if (!extent) {
                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
-                                        (unsigned long) start);
+                                        (unsigned long) *iterator);
                        return -EIO;
                }
-               if (start > le32_to_cpu(extent->ee_block)) {
+               if (SHIFT == SHIFT_LEFT && *iterator >
+                   le32_to_cpu(extent->ee_block)) {
                        /* Hole, move to the next extent */
                        if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
                                path[depth].p_ext++;
                        } else {
-                               start = ext4_ext_next_allocated_block(path);
+                               *iterator = ext4_ext_next_allocated_block(path);
                                continue;
                        }
                }

+               if (SHIFT == SHIFT_LEFT) {
+                       extent = EXT_LAST_EXTENT(path[depth].p_hdr);
+                       *iterator = le32_to_cpu(extent->ee_block) +
+                                       ext4_ext_get_actual_len(extent);
+               } else {
+                       extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
+                       *iterator =  le32_to_cpu(extent->ee_block) > 0 ?
+                               le32_to_cpu(extent->ee_block) - 1 : 0;
+                       /* Update path extent in case we need to stop */
+                       while (le32_to_cpu(extent->ee_block) < start)
+                               extent++;
+                       path[depth].p_ext = extent;
+               }
                ret = ext4_ext_shift_path_extents(path, shift, inode,
-                               handle, &start);
+                               handle, SHIFT);
                if (ret)
                        break;
        }
@@ -5485,7 +5526,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
        ext4_discard_preallocations(inode);

        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
-                                    punch_stop - punch_start);
+                                    punch_stop - punch_start, SHIFT_LEFT);
        if (ret) {
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
@@ -5510,6 +5551,166 @@ out_mutex:
        return ret;
 }

+/*
+ * ext4_insert_range:
+ * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
+ * The data blocks starting from @offset to the EOF are shifted by @len
+ * towards right to create a hole in the @inode. Inode size is increased
+ * by len bytes.
+ * Returns 0 on success, error otherwise.
+ */
+int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+{
+       struct super_block *sb = inode->i_sb;
+       handle_t *handle;
+       struct ext4_ext_path *path;
+       struct ext4_extent *extent;
+       ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
+       unsigned int credits, ee_len;
+       int ret = 0, depth, split_flag = 0;
+       loff_t ioffset;

+       /* Insert range works only on fs block size aligned offsets. */
+       if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
+                       len & (EXT4_CLUSTER_SIZE(sb) - 1))
+               return -EINVAL;

+       if (!S_ISREG(inode->i_mode))
+               return -EOPNOTSUPP;

+       trace_ext4_insert_range(inode, offset, len);

+       offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+       len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);

+       /* Call ext4_force_commit to flush all data in case of data=journal */
+       if (ext4_should_journal_data(inode)) {
+               ret = ext4_force_commit(inode->i_sb);
+               if (ret)
+                       return ret;
+       }

+       /*
+        * Need to round down to align start offset to page size boundary
+        * for page size > block size.
+        */
+       ioffset = round_down(offset, PAGE_SIZE);

+       /* Write out all dirty pages */
+       ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+                       LLONG_MAX);
+       if (ret)
+               return ret;

+       /* Take mutex lock */
+       mutex_lock(&inode->i_mutex);

+       /* Currently just for extent based files */
+       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               ret = -EOPNOTSUPP;
+               goto out_mutex;
+       }

+       /* Check for wrap through zero */
+       if (inode->i_size + len > inode->i_sb->s_maxbytes) {
+               ret = -EFBIG;
+               goto out_mutex;
+       }

+       /* Offset should be less than i_size */
+       if (offset >= i_size_read(inode)) {
+               ret = -EINVAL;
+               goto out_mutex;
+       }

+       truncate_pagecache(inode, ioffset);

+       /* Wait for existing dio to complete */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);

+       credits = ext4_writepage_trans_blocks(inode);
+       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out_dio;
+       }

+       /* Expand file to avoid data loss if there is error while shifting */
+       inode->i_size += len;
+       EXT4_I(inode)->i_disksize += len;
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+       ret = ext4_mark_inode_dirty(handle, inode);
+       if (ret)
+               goto out_stop;

+       down_write(&EXT4_I(inode)->i_data_sem);
+       ext4_discard_preallocations(inode);

+       path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+       if (IS_ERR(path)) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }

+       depth = ext_depth(inode);
+       extent = path[depth].p_ext;
+       if (extent) {
+               ee_start_lblk = le32_to_cpu(extent->ee_block);
+               ee_len = ext4_ext_get_actual_len(extent);

+               /*
+                * If offset_lblk is not the starting block of extent, split
+                * the extent @offset_lblk
+                */
+               if ((offset_lblk > ee_start_lblk) &&
+                               (offset_lblk < (ee_start_lblk + ee_len))) {
+                       if (ext4_ext_is_unwritten(extent))
+                               split_flag = EXT4_EXT_MARK_UNWRIT1 |
+                                       EXT4_EXT_MARK_UNWRIT2;
+                       ret = ext4_split_extent_at(handle, inode, &path,
+                                       offset_lblk, split_flag,
+                                       EXT4_EX_NOCACHE |
+                                       EXT4_GET_BLOCKS_PRE_IO |
+                                       EXT4_GET_BLOCKS_METADATA_NOFAIL);
+               }

+               ext4_ext_drop_refs(path);
+               kfree(path);
+               if (ret < 0) {
+                       up_write(&EXT4_I(inode)->i_data_sem);
+                       goto out_stop;
+               }
+       }

+       ret = ext4_es_remove_extent(inode, offset_lblk,
+                       EXT_MAX_BLOCKS - offset_lblk);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }

+       /*
+        * if offset_lblk lies in a hole which is at start of file, use
+        * ee_start_lblk to shift extents
+        */
+       ret = ext4_ext_shift_extents(inode, handle,
+               ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
+               len_lblk, SHIFT_RIGHT);

+       up_write(&EXT4_I(inode)->i_data_sem);
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);

+out_stop:
+       ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
+}

 /**
  * ext4_swap_extents - Swap extents between two inodes
  *
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 08ec3dd..0faf570 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2478,6 +2478,31 @@ TRACE_EVENT(ext4_collapse_range,
                  __entry->offset, __entry->len)
 );

+TRACE_EVENT(ext4_insert_range,
+       TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

+       TP_ARGS(inode, offset, len),

+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(loff_t, offset)
+               __field(loff_t, len)
+       ),

+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->offset = offset;
+               __entry->len    = len;
+       ),

+       TP_printk("dev %d,%d ino %lu offset %lld len %lld",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 __entry->offset, __entry->len)
+);

 TRACE_EVENT(ext4_es_shrink,
        TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
                 int nr_skipped, int retried),