add patch remove-unneeded-test-of-ret-variable
[ext4-patch-queue.git] / add-support-collapse-range
blobd9f932db6580108b1455fcbb222b3b500d8bc020
1 ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate
3 From: Namjae Jeon <namjae.jeon@samsung.com>
5 This patch implements fallocate's FALLOC_FL_COLLAPSE_RANGE for Ext4.
6  
7 The semantics of this flag are as follows:
8 1) It collapses the range lying between offset and length by removing any data
9    blocks which are present in this range and then updates all the logical
10    offsets of extents beyond "offset + len" to nullify the hole created by
11    removing blocks. In short, it does not leave a hole.
12 2) It should be used exclusively. No other fallocate flag in combination.
13 3) Offset and length supplied to fallocate should be fs block size aligned
14    in case of xfs and ext4.
15 4) Collapse range does not work beyond i_size.
17 Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
18 Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
19 Tested-by: Dongsu Park <dongsu.park@profitbricks.com>
20 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
22 ---
23  fs/ext4/ext4.h              |   3 ++
24  fs/ext4/extents.c           | 304 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
25  fs/ext4/move_extent.c       |   2 +-
26  include/trace/events/ext4.h |  33 +++++++++++-
27  4 files changed, 339 insertions(+), 3 deletions(-)
29 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
30 index b7207db..beec427 100644
31 --- a/fs/ext4/ext4.h
32 +++ b/fs/ext4/ext4.h
33 @@ -2758,6 +2758,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
34  extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
35                         __u64 start, __u64 len);
36  extern int ext4_ext_precache(struct inode *inode);
37 +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
39  /* move_extent.c */
40  extern void ext4_double_down_write_data_sem(struct inode *first,
41 @@ -2767,6 +2768,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
42  extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
43                              __u64 start_orig, __u64 start_donor,
44                              __u64 len, __u64 *moved_len);
45 +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
46 +                           struct ext4_extent **extent);
48  /* page-io.c */
49  extern int __init ext4_init_pageio(void);
50 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
51 index 2e0608e..7015917 100644
52 --- a/fs/ext4/extents.c
53 +++ b/fs/ext4/extents.c
54 @@ -4581,12 +4581,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
55         unsigned int credits, blkbits = inode->i_blkbits;
57         /* Return error if mode is not supported */
58 -       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
59 +       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
60 +                    FALLOC_FL_COLLAPSE_RANGE))
61                 return -EOPNOTSUPP;
63         if (mode & FALLOC_FL_PUNCH_HOLE)
64                 return ext4_punch_hole(inode, offset, len);
66 +       if (mode & FALLOC_FL_COLLAPSE_RANGE)
67 +               return ext4_collapse_range(inode, offset, len);
69         ret = ext4_convert_inline_data(inode);
70         if (ret)
71                 return ret;
72 @@ -4885,3 +4889,301 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
73         ext4_es_lru_add(inode);
74         return error;
75  }
77 +/*
78 + * ext4_access_path:
79 + * Function to access the path buffer for marking it dirty.
80 + * It also checks if there are sufficient credits left in the journal handle
81 + * to update path.
82 + */
83 +static int
84 +ext4_access_path(handle_t *handle, struct inode *inode,
85 +               struct ext4_ext_path *path)
87 +       int credits, err;
89 +       /*
90 +        * Check if need to extend journal credits
91 +        * 3 for leaf, sb, and inode plus 2 (bmap and group
92 +        * descriptor) for each block group; assume two block
93 +        * groups
94 +        */
95 +       if (handle->h_buffer_credits < 7) {
96 +               credits = ext4_writepage_trans_blocks(inode);
97 +               err = ext4_ext_truncate_extend_restart(handle, inode, credits);
98 +               /* EAGAIN is success */
99 +               if (err && err != -EAGAIN)
100 +                       return err;
101 +       }
103 +       err = ext4_ext_get_access(handle, inode, path);
104 +       return err;
108 + * ext4_ext_shift_path_extents:
109 + * Shift the extents of a path structure lying between path[depth].p_ext
110 + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
111 + * from starting block for each extent.
112 + */
113 +static int
114 +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
115 +                           struct inode *inode, handle_t *handle,
116 +                           ext4_lblk_t *start)
118 +       int depth, err = 0;
119 +       struct ext4_extent *ex_start, *ex_last;
120 +       bool update = 0;
121 +       depth = path->p_depth;
123 +       while (depth >= 0) {
124 +               if (depth == path->p_depth) {
125 +                       ex_start = path[depth].p_ext;
126 +                       if (!ex_start)
127 +                               return -EIO;
129 +                       ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
130 +                       if (!ex_last)
131 +                               return -EIO;
133 +                       err = ext4_access_path(handle, inode, path + depth);
134 +                       if (err)
135 +                               goto out;
137 +                       if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
138 +                               update = 1;
140 +                       *start = ex_last->ee_block +
141 +                               ext4_ext_get_actual_len(ex_last);
143 +                       while (ex_start <= ex_last) {
144 +                               ex_start->ee_block -= shift;
145 +                               if (ex_start >
146 +                                       EXT_FIRST_EXTENT(path[depth].p_hdr)) {
147 +                                       if (ext4_ext_try_to_merge_right(inode,
148 +                                               path, ex_start - 1))
149 +                                               ex_last--;
150 +                               }
151 +                               ex_start++;
152 +                       }
153 +                       err = ext4_ext_dirty(handle, inode, path + depth);
154 +                       if (err)
155 +                               goto out;
157 +                       if (--depth < 0 || !update)
158 +                               break;
159 +               }
161 +               /* Update index too */
162 +               err = ext4_access_path(handle, inode, path + depth);
163 +               if (err)
164 +                       goto out;
166 +               path[depth].p_idx->ei_block -= shift;
167 +               err = ext4_ext_dirty(handle, inode, path + depth);
168 +               if (err)
169 +                       goto out;
171 +               /* we are done if current index is not a starting index */
172 +               if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
173 +                       break;
175 +               depth--;
176 +       }
178 +out:
179 +       return err;
183 + * ext4_ext_shift_extents:
184 + * All the extents which lie in the range from start to the last allocated
185 + * block for the file are shifted downwards by shift blocks.
186 + * On success, 0 is returned, error otherwise.
187 + */
188 +static int
189 +ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
190 +                      ext4_lblk_t start, ext4_lblk_t shift)
192 +       struct ext4_ext_path *path;
193 +       int ret = 0, depth;
194 +       struct ext4_extent *extent;
195 +       ext4_lblk_t stop_block, current_block;
196 +       ext4_lblk_t ex_start, ex_end;
198 +       /* Let path point to the last extent */
199 +       path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
200 +       if (IS_ERR(path))
201 +               return PTR_ERR(path);
203 +       depth = path->p_depth;
204 +       extent = path[depth].p_ext;
205 +       if (!extent) {
206 +               ext4_ext_drop_refs(path);
207 +               kfree(path);
208 +               return ret;
209 +       }
211 +       stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
212 +       ext4_ext_drop_refs(path);
213 +       kfree(path);
215 +       /* Nothing to shift, if hole is at the end of file */
216 +       if (start >= stop_block)
217 +               return ret;
219 +       /*
220 +        * Don't start shifting extents until we make sure the hole is big
221 +        * enough to accomodate the shift.
222 +        */
223 +       path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
224 +       depth = path->p_depth;
225 +       extent =  path[depth].p_ext;
226 +       ex_start = extent->ee_block;
227 +       ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
228 +       ext4_ext_drop_refs(path);
229 +       kfree(path);
231 +       if ((start == ex_start && shift > ex_start) ||
232 +           (shift > start - ex_end))
233 +               return -EINVAL;
235 +       /* It's safe to start updating extents */
236 +       while (start < stop_block) {
237 +               path = ext4_ext_find_extent(inode, start, NULL, 0);
238 +               if (IS_ERR(path))
239 +                       return PTR_ERR(path);
240 +               depth = path->p_depth;
241 +               extent = path[depth].p_ext;
242 +               current_block = extent->ee_block;
243 +               if (start > current_block) {
244 +                       /* Hole, move to the next extent */
245 +                       ret = mext_next_extent(inode, path, &extent);
246 +                       if (ret != 0) {
247 +                               ext4_ext_drop_refs(path);
248 +                               kfree(path);
249 +                               if (ret == 1)
250 +                                       ret = 0;
251 +                               break;
252 +                       }
253 +               }
254 +               ret = ext4_ext_shift_path_extents(path, shift, inode,
255 +                               handle, &start);
256 +               ext4_ext_drop_refs(path);
257 +               kfree(path);
258 +               if (ret)
259 +                       break;
260 +       }
262 +       return ret;
266 + * ext4_collapse_range:
267 + * This implements the fallocate's collapse range functionality for ext4
268 + * Returns: 0 on success, non-zero on error.
269 + */
270 +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
272 +       struct super_block *sb = inode->i_sb;
273 +       ext4_lblk_t punch_start, punch_stop;
274 +       handle_t *handle;
275 +       unsigned int credits;
276 +       loff_t new_size;
277 +       int ret;
279 +       BUG_ON(offset + len > i_size_read(inode));
281 +       /* Collapse range works only on fs block size aligned offsets. */
282 +       if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
283 +           len & (EXT4_BLOCK_SIZE(sb) - 1))
284 +               return -EINVAL;
286 +       if (!S_ISREG(inode->i_mode))
287 +               return -EOPNOTSUPP;
289 +       trace_ext4_collapse_range(inode, offset, len);
291 +       punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
292 +       punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
294 +       /* Write out all dirty pages */
295 +       ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1);
296 +       if (ret)
297 +               return ret;
299 +       /* Take mutex lock */
300 +       mutex_lock(&inode->i_mutex);
302 +       /* Collapse range is not permitted on append-only or immutable files */
303 +       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
304 +               ret = -EPERM;
305 +               goto out_mutex;
306 +       }
308 +       if (IS_SWAPFILE(inode)) {
309 +               ret = -ETXTBSY;
310 +               goto out_mutex;
311 +       }
313 +       /* Currently just for extent based files */
314 +       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
315 +               ret = -EOPNOTSUPP;
316 +               goto out_mutex;
317 +       }
319 +       truncate_pagecache_range(inode, offset, -1);
321 +       /* Wait for existing dio to complete */
322 +       ext4_inode_block_unlocked_dio(inode);
323 +       inode_dio_wait(inode);
325 +       credits = ext4_writepage_trans_blocks(inode);
326 +       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
327 +       if (IS_ERR(handle)) {
328 +               ret = PTR_ERR(handle);
329 +               goto out_dio;
330 +       }
332 +       down_write(&EXT4_I(inode)->i_data_sem);
333 +       ext4_discard_preallocations(inode);
335 +       ret = ext4_es_remove_extent(inode, punch_start,
336 +                                   EXT_MAX_BLOCKS - punch_start - 1);
337 +       if (ret) {
338 +               up_write(&EXT4_I(inode)->i_data_sem);
339 +               goto out_stop;
340 +       }
342 +       ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
343 +       if (ret) {
344 +               up_write(&EXT4_I(inode)->i_data_sem);
345 +               goto out_stop;
346 +       }
348 +       ret = ext4_ext_shift_extents(inode, handle, punch_stop,
349 +                                    punch_stop - punch_start);
350 +       if (ret) {
351 +               up_write(&EXT4_I(inode)->i_data_sem);
352 +               goto out_stop;
353 +       }
355 +       new_size = i_size_read(inode) - len;
356 +       truncate_setsize(inode, new_size);
357 +       EXT4_I(inode)->i_disksize = new_size;
359 +       ext4_discard_preallocations(inode);
360 +       up_write(&EXT4_I(inode)->i_data_sem);
361 +       if (IS_SYNC(inode))
362 +               ext4_handle_sync(handle);
363 +       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
364 +       ext4_mark_inode_dirty(handle, inode);
366 +out_stop:
367 +       ext4_journal_stop(handle);
368 +out_dio:
369 +       ext4_inode_resume_unlocked_dio(inode);
370 +out_mutex:
371 +       mutex_unlock(&inode->i_mutex);
372 +       return ret;
374 diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
375 index f39a88a..58ee7dc 100644
376 --- a/fs/ext4/move_extent.c
377 +++ b/fs/ext4/move_extent.c
378 @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
379   * ext4_ext_path structure refers to the last extent, or a negative error
380   * value on failure.
381   */
382 -static int
383 +int
384  mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
385                       struct ext4_extent **extent)
387 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
388 index 451e020..e9d7ee7 100644
389 --- a/include/trace/events/ext4.h
390 +++ b/include/trace/events/ext4.h
391 @@ -16,6 +16,11 @@ struct mpage_da_data;
392  struct ext4_map_blocks;
393  struct extent_status;
395 +/* shim until we merge in the xfs_collapse_range branch */
396 +#ifndef FALLOC_FL_COLLAPSE_RANGE
397 +#define FALLOC_FL_COLLAPSE_RANGE       0x08
398 +#endif
400  #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
402  #define show_mballoc_flags(flags) __print_flags(flags, "|",    \
403 @@ -71,7 +76,8 @@ struct extent_status;
404  #define show_falloc_mode(mode) __print_flags(mode, "|",                \
405         { FALLOC_FL_KEEP_SIZE,          "KEEP_SIZE"},           \
406         { FALLOC_FL_PUNCH_HOLE,         "PUNCH_HOLE"},          \
407 -       { FALLOC_FL_NO_HIDE_STALE,      "NO_HIDE_STALE"})
408 +       { FALLOC_FL_NO_HIDE_STALE,      "NO_HIDE_STALE"},       \
409 +       { FALLOC_FL_COLLAPSE_RANGE,     "COLLAPSE_RANGE"})
412  TRACE_EVENT(ext4_free_inode,
413 @@ -2415,6 +2421,31 @@ TRACE_EVENT(ext4_es_shrink_exit,
414                   __entry->shrunk_nr, __entry->cache_cnt)
415  );
417 +TRACE_EVENT(ext4_collapse_range,
418 +       TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
420 +       TP_ARGS(inode, offset, len),
422 +       TP_STRUCT__entry(
423 +               __field(dev_t,  dev)
424 +               __field(ino_t,  ino)
425 +               __field(loff_t, offset)
426 +               __field(loff_t, len)
427 +       ),
429 +       TP_fast_assign(
430 +               __entry->dev    = inode->i_sb->s_dev;
431 +               __entry->ino    = inode->i_ino;
432 +               __entry->offset = offset;
433 +               __entry->len    = len;
434 +       ),
436 +       TP_printk("dev %d,%d ino %lu offset %lld len %lld",
437 +                 MAJOR(__entry->dev), MINOR(__entry->dev),
438 +                 (unsigned long) __entry->ino,
439 +                 __entry->offset, __entry->len)
442  #endif /* _TRACE_EXT4_H */
444  /* This part must be outside protection */