1 ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate
3 From: Namjae Jeon <namjae.jeon@samsung.com>
5 This patch implements fallocate's FALLOC_FL_INSERT_RANGE for Ext4.
7 1) Make sure that both offset and len are block size aligned.
8 2) Update the i_size of inode by len bytes.
9 3) Compute the file's logical block number against offset. If the computed
10 block number is not the starting block of the extent, split the extent
11 such that the block number is the starting block of the extent.
12 4) Shift all the extents lying in the range [offset, last allocated extent]
13 towards the right by len bytes. This step will make a hole of len bytes
16 Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
17 Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
20 fs/ext4/extents.c | 305 ++++++++++++++++++++++++++++++++++++++++++++++++----------
21 include/trace/events/ext4.h | 25 +++++
22 3 files changed, 284 insertions(+), 52 deletions(-)
24 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
25 index 730c88d..3ab7cd8 100644
28 @@ -90,6 +90,11 @@ typedef __u32 ext4_lblk_t;
29 /* data type for block group number */
30 typedef unsigned int ext4_group_t;
32 +enum SHIFT_DIRECTION {
38 * Flags used in mballoc's allocation_context flags field.
40 @@ -2947,6 +2952,7 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
41 __u64 start, __u64 len);
42 extern int ext4_ext_precache(struct inode *inode);
43 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
44 +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
45 extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
46 struct inode *inode2, ext4_lblk_t lblk1,
47 ext4_lblk_t lblk2, ext4_lblk_t count,
48 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
49 index f38a6d6..da91d49 100644
50 --- a/fs/ext4/extents.c
51 +++ b/fs/ext4/extents.c
52 @@ -4912,12 +4912,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
53 * bug we should fix....
55 if (ext4_encrypted_inode(inode) &&
56 - (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)))
57 + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
58 + FALLOC_FL_ZERO_RANGE)))
61 /* Return error if mode is not supported */
62 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
63 - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
64 + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
65 + FALLOC_FL_INSERT_RANGE))
68 if (mode & FALLOC_FL_PUNCH_HOLE)
69 @@ -4930,6 +4932,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
70 if (mode & FALLOC_FL_COLLAPSE_RANGE)
71 return ext4_collapse_range(inode, offset, len);
73 + if (mode & FALLOC_FL_INSERT_RANGE)
74 + return ext4_insert_range(inode, offset, len);
76 if (mode & FALLOC_FL_ZERO_RANGE)
77 return ext4_zero_range(file, offset, len, mode);
79 @@ -5224,13 +5229,13 @@ ext4_access_path(handle_t *handle, struct inode *inode,
81 * ext4_ext_shift_path_extents:
82 * Shift the extents of a path structure lying between path[depth].p_ext
83 - * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
84 - * from starting block for each extent.
85 + * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
86 + * if it is right shift or left shift operation.
89 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
90 struct inode *inode, handle_t *handle,
92 + enum SHIFT_DIRECTION SHIFT)
95 struct ext4_extent *ex_start, *ex_last;
96 @@ -5252,19 +5257,25 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
97 if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
100 - *start = le32_to_cpu(ex_last->ee_block) +
101 - ext4_ext_get_actual_len(ex_last);
103 while (ex_start <= ex_last) {
104 - le32_add_cpu(&ex_start->ee_block, -shift);
105 - /* Try to merge to the left. */
107 - EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
108 - ext4_ext_try_to_merge_right(inode,
109 - path, ex_start - 1))
110 + if (SHIFT == SHIFT_LEFT) {
111 + le32_add_cpu(&ex_start->ee_block,
113 + /* Try to merge to the left. */
115 + EXT_FIRST_EXTENT(path[depth].p_hdr))
117 + ext4_ext_try_to_merge_right(inode,
118 + path, ex_start - 1))
123 + le32_add_cpu(&ex_last->ee_block, shift);
124 + ext4_ext_try_to_merge_right(inode, path,
131 err = ext4_ext_dirty(handle, inode, path + depth);
133 @@ -5279,7 +5290,10 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
137 - le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
138 + if (SHIFT == SHIFT_LEFT)
139 + le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
141 + le32_add_cpu(&path[depth].p_idx->ei_block, shift);
142 err = ext4_ext_dirty(handle, inode, path + depth);
145 @@ -5297,19 +5311,20 @@ out:
148 * ext4_ext_shift_extents:
149 - * All the extents which lies in the range from start to the last allocated
150 - * block for the file are shifted downwards by shift blocks.
151 + * All the extents which lies in the range from @start to the last allocated
152 + * block for the @inode are shifted either towards left or right (depending
153 + * upon @SHIFT) by @shift blocks.
154 * On success, 0 is returned, error otherwise.
157 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
158 - ext4_lblk_t start, ext4_lblk_t shift)
159 + ext4_lblk_t start, ext4_lblk_t shift,
160 + enum SHIFT_DIRECTION SHIFT)
162 struct ext4_ext_path *path;
164 struct ext4_extent *extent;
165 - ext4_lblk_t stop_block;
166 - ext4_lblk_t ex_start, ex_end;
167 + ext4_lblk_t stop, *iterator, ex_start, ex_end;
169 /* Let path point to the last extent */
170 path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
171 @@ -5321,58 +5336,84 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
175 - stop_block = le32_to_cpu(extent->ee_block) +
176 + stop = le32_to_cpu(extent->ee_block) +
177 ext4_ext_get_actual_len(extent);
179 - /* Nothing to shift, if hole is at the end of file */
180 - if (start >= stop_block)
183 + * In case of left shift, don't start shifting extents until we make
184 + * sure the hole is big enough to accommodate the shift.
186 + if (SHIFT == SHIFT_LEFT) {
187 + path = ext4_find_extent(inode, start - 1, &path, 0);
189 + return PTR_ERR(path);
190 + depth = path->p_depth;
191 + extent = path[depth].p_ext;
193 + ex_start = le32_to_cpu(extent->ee_block);
194 + ex_end = le32_to_cpu(extent->ee_block) +
195 + ext4_ext_get_actual_len(extent);
202 - * Don't start shifting extents until we make sure the hole is big
203 - * enough to accomodate the shift.
205 - path = ext4_find_extent(inode, start - 1, &path, 0);
207 - return PTR_ERR(path);
208 - depth = path->p_depth;
209 - extent = path[depth].p_ext;
211 - ex_start = le32_to_cpu(extent->ee_block);
212 - ex_end = le32_to_cpu(extent->ee_block) +
213 - ext4_ext_get_actual_len(extent);
217 + if ((start == ex_start && shift > ex_start) ||
218 + (shift > start - ex_end)) {
219 + ext4_ext_drop_refs(path);
225 - if ((start == ex_start && shift > ex_start) ||
226 - (shift > start - ex_end))
229 + * In case of left shift, iterator points to start and it is increased
230 + * till we reach stop. In case of right shift, iterator points to stop
231 + * and it is decreased till we reach start.
233 + if (SHIFT == SHIFT_LEFT)
238 /* Its safe to start updating extents */
239 - while (start < stop_block) {
240 - path = ext4_find_extent(inode, start, &path, 0);
241 + while (start < stop) {
242 + path = ext4_find_extent(inode, *iterator, &path, 0);
244 return PTR_ERR(path);
245 depth = path->p_depth;
246 extent = path[depth].p_ext;
248 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
249 - (unsigned long) start);
250 + (unsigned long) *iterator);
253 - if (start > le32_to_cpu(extent->ee_block)) {
254 + if (SHIFT == SHIFT_LEFT && *iterator >
255 + le32_to_cpu(extent->ee_block)) {
256 /* Hole, move to the next extent */
257 if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
260 - start = ext4_ext_next_allocated_block(path);
261 + *iterator = ext4_ext_next_allocated_block(path);
266 + if (SHIFT == SHIFT_LEFT) {
267 + extent = EXT_LAST_EXTENT(path[depth].p_hdr);
268 + *iterator = le32_to_cpu(extent->ee_block) +
269 + ext4_ext_get_actual_len(extent);
271 + extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
272 + *iterator = le32_to_cpu(extent->ee_block) > 0 ?
273 + le32_to_cpu(extent->ee_block) - 1 : 0;
274 + /* Update path extent in case we need to stop */
275 + while (le32_to_cpu(extent->ee_block) < start)
277 + path[depth].p_ext = extent;
279 ret = ext4_ext_shift_path_extents(path, shift, inode,
285 @@ -5485,7 +5526,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
286 ext4_discard_preallocations(inode);
288 ret = ext4_ext_shift_extents(inode, handle, punch_stop,
289 - punch_stop - punch_start);
290 + punch_stop - punch_start, SHIFT_LEFT);
292 up_write(&EXT4_I(inode)->i_data_sem);
294 @@ -5510,6 +5551,166 @@ out_mutex:
299 + * ext4_insert_range:
300 + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
301 + * The data blocks starting from @offset to the EOF are shifted by @len
302 + * towards right to create a hole in the @inode. Inode size is increased
304 + * Returns 0 on success, error otherwise.
306 +int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
308 + struct super_block *sb = inode->i_sb;
310 + struct ext4_ext_path *path;
311 + struct ext4_extent *extent;
312 + ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
313 + unsigned int credits, ee_len;
314 + int ret = 0, depth, split_flag = 0;
317 + /* Insert range works only on fs block size aligned offsets. */
318 + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
319 + len & (EXT4_CLUSTER_SIZE(sb) - 1))
322 + if (!S_ISREG(inode->i_mode))
323 + return -EOPNOTSUPP;
325 + trace_ext4_insert_range(inode, offset, len);
327 + offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
328 + len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
330 + /* Call ext4_force_commit to flush all data in case of data=journal */
331 + if (ext4_should_journal_data(inode)) {
332 + ret = ext4_force_commit(inode->i_sb);
338 + * Need to round down to align start offset to page size boundary
339 + * for page size > block size.
341 + ioffset = round_down(offset, PAGE_SIZE);
343 + /* Write out all dirty pages */
344 + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
349 + /* Take mutex lock */
350 + mutex_lock(&inode->i_mutex);
352 + /* Currently just for extent based files */
353 + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
358 + /* Check for wrap through zero */
359 + if (inode->i_size + len > inode->i_sb->s_maxbytes) {
364 + /* Offset should be less than i_size */
365 + if (offset >= i_size_read(inode)) {
370 + truncate_pagecache(inode, ioffset);
372 + /* Wait for existing dio to complete */
373 + ext4_inode_block_unlocked_dio(inode);
374 + inode_dio_wait(inode);
376 + credits = ext4_writepage_trans_blocks(inode);
377 + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
378 + if (IS_ERR(handle)) {
379 + ret = PTR_ERR(handle);
383 + /* Expand file to avoid data loss if there is error while shifting */
384 + inode->i_size += len;
385 + EXT4_I(inode)->i_disksize += len;
386 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
387 + ret = ext4_mark_inode_dirty(handle, inode);
391 + down_write(&EXT4_I(inode)->i_data_sem);
392 + ext4_discard_preallocations(inode);
394 + path = ext4_find_extent(inode, offset_lblk, NULL, 0);
395 + if (IS_ERR(path)) {
396 + up_write(&EXT4_I(inode)->i_data_sem);
400 + depth = ext_depth(inode);
401 + extent = path[depth].p_ext;
403 + ee_start_lblk = le32_to_cpu(extent->ee_block);
404 + ee_len = ext4_ext_get_actual_len(extent);
407 + * If offset_lblk is not the starting block of extent, split
408 + * the extent @offset_lblk
410 + if ((offset_lblk > ee_start_lblk) &&
411 + (offset_lblk < (ee_start_lblk + ee_len))) {
412 + if (ext4_ext_is_unwritten(extent))
413 + split_flag = EXT4_EXT_MARK_UNWRIT1 |
414 + EXT4_EXT_MARK_UNWRIT2;
415 + ret = ext4_split_extent_at(handle, inode, &path,
416 + offset_lblk, split_flag,
418 + EXT4_GET_BLOCKS_PRE_IO |
419 + EXT4_GET_BLOCKS_METADATA_NOFAIL);
422 + ext4_ext_drop_refs(path);
425 + up_write(&EXT4_I(inode)->i_data_sem);
430 + ret = ext4_es_remove_extent(inode, offset_lblk,
431 + EXT_MAX_BLOCKS - offset_lblk);
433 + up_write(&EXT4_I(inode)->i_data_sem);
438 + * If offset_lblk lies in a hole which is at the start of the file, use
439 + * ee_start_lblk to shift extents
441 + ret = ext4_ext_shift_extents(inode, handle,
442 + ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
443 + len_lblk, SHIFT_RIGHT);
445 + up_write(&EXT4_I(inode)->i_data_sem);
446 + if (IS_SYNC(inode))
447 + ext4_handle_sync(handle);
450 + ext4_journal_stop(handle);
452 + ext4_inode_resume_unlocked_dio(inode);
454 + mutex_unlock(&inode->i_mutex);
459 * ext4_swap_extents - Swap extents between two inodes
461 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
462 index 08ec3dd..0faf570 100644
463 --- a/include/trace/events/ext4.h
464 +++ b/include/trace/events/ext4.h
465 @@ -2478,6 +2478,31 @@ TRACE_EVENT(ext4_collapse_range,
466 __entry->offset, __entry->len)
469 +TRACE_EVENT(ext4_insert_range,
470 + TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
472 + TP_ARGS(inode, offset, len),
475 + __field(dev_t, dev)
476 + __field(ino_t, ino)
477 + __field(loff_t, offset)
478 + __field(loff_t, len)
482 + __entry->dev = inode->i_sb->s_dev;
483 + __entry->ino = inode->i_ino;
484 + __entry->offset = offset;
485 + __entry->len = len;
488 + TP_printk("dev %d,%d ino %lu offset %lld len %lld",
489 + MAJOR(__entry->dev), MINOR(__entry->dev),
490 + (unsigned long) __entry->ino,
491 + __entry->offset, __entry->len)
494 TRACE_EVENT(ext4_es_shrink,
495 TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
496 int nr_skipped, int retried),