ext4: Introduce FALLOC_FL_ZERO_RANGE flag for fallocate

From: Lukas Czerner <lczerner@redhat.com>
Introduce the new FALLOC_FL_ZERO_RANGE flag for fallocate. It provides the
same functionality as the xfs ioctl XFS_IOC_ZERO_RANGE.

It can be used to convert a range of a file to zeros, preferably without
issuing data I/O. Blocks should be preallocated for the regions that span
holes in the file, and the entire range is preferably converted to
unwritten extents.

This can also be used to preallocate blocks past EOF in the same way as
with fallocate. The FALLOC_FL_KEEP_SIZE flag should cause the inode size
to remain the same.
Also add appropriate tracepoints.
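
For illustration, a minimal user-space sketch of how the new mode can be
used (the file name and byte range are made up, and it assumes headers that
already define FALLOC_FL_ZERO_RANGE):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("testfile", O_RDWR);	/* hypothetical test file */

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* Convert bytes [4096, 4096 + 1048576) to zeros; with
		 * FALLOC_FL_KEEP_SIZE the inode size stays the same even
		 * if the range extends past EOF. */
		if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
			      4096, 1048576) < 0)
			perror("fallocate");

		close(fd);
		return 0;
	}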
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h              |   2 +
 fs/ext4/extents.c           | 273 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/ext4/inode.c             |  17 +++--
 include/trace/events/ext4.h |  68 +++++++++--------
 4 files changed, 307 insertions(+), 53 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index beec427..1b3cbf8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -568,6 +568,8 @@ enum {
#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
/* Do not put hole in extent cache */
#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
+ /* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
* The bit position of these flags must not overlap with any of the
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 1c09a09..491208c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3602,6 +3602,8 @@ out:
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
+ * This works the same way in the case of initialized -> unwritten conversion.
* One of more index blocks maybe needed if the extent tree grow after
* the uninitialized extent split. To prevent ENOSPC occur at the IO
* complete, we need to split the uninitialized extent before DIO submit
@@ -3612,7 +3614,7 @@ out:
* Returns the size of uninitialized extent to be written on success.
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
struct ext4_map_blocks *map,
struct ext4_ext_path *path,
@@ -3624,9 +3626,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
int split_flag = 0, depth;
- ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
- "block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)map->m_lblk, map->m_len);
+ ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+ __func__, inode->i_ino,
+ (unsigned long long)map->m_lblk, map->m_len);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
@@ -3641,14 +3643,73 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
- split_flag |= EXT4_EXT_MARK_UNINIT2;
- if (flags & EXT4_GET_BLOCKS_CONVERT)
- split_flag |= EXT4_EXT_DATA_VALID2;
+ /* Convert to unwritten */
+ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+ split_flag |= EXT4_EXT_DATA_VALID1;
+ /* Convert to initialized */
+ } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ split_flag |= ee_block + ee_len <= eof_block ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
flags |= EXT4_GET_BLOCKS_PRE_IO;
return ext4_split_extent(handle, inode, path, map, split_flag, flags);
+static int ext4_convert_initialized_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ ext_debug("%s: inode %lu, logical"
+ "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+ (unsigned long long)ee_block, ee_len);
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ /* first mark the extent as uninitialized */
+ ext4_ext_mark_uninitialized(ex);
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ ext4_ext_show_leaf(inode, path);
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct ext4_map_blocks *map,
@@ -3682,8 +3743,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
- err = ext4_split_unwritten_extents(handle, inode, map, path,
- EXT4_GET_BLOCKS_CONVERT);
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT);
ext4_ext_drop_refs(path);
@@ -3884,6 +3945,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, ext4_fsblk_t newblock)
+ * Make sure that the extent is no bigger than we support with
+ * uninitialized extent
+ if (map->m_len > EXT_UNINIT_MAX_LEN)
+ map->m_len = EXT_UNINIT_MAX_LEN / 2;
+ ret = ext4_convert_initialized_extents(handle, inode, map,
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_len = allocated;
+ return err ? err : allocated;
ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path, int flags,
@@ -3910,8 +4003,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
/* get_block() before submit the IO, split the extent */
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- ret = ext4_split_unwritten_extents(handle, inode, map,
+ ret = ext4_split_convert_extents(handle, inode, map,
+ path, flags | EXT4_GET_BLOCKS_CONVERT);
@@ -4199,6 +4292,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
unsigned short ee_len;
* Uninitialized extents are treated as holes, except that
* we split out initialized portions during a write.
@@ -4215,7 +4309,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- if (!ext4_ext_is_uninitialized(ex))
+ * If the extent is initialized check whether the
+ * caller wants to convert it to unwritten.
+ if ((!ext4_ext_is_uninitialized(ex)) &&
+ (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+ allocated = ext4_ext_convert_initialized_extent(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ } else if (!ext4_ext_is_uninitialized(ex))
ret = ext4_ext_handle_uninitialized_extents(
@@ -4604,6 +4708,144 @@ retry:
return ret > 0 ? ret2 : ret;
+static long ext4_zero_range(struct file *file, loff_t offset,
+ loff_t len, int mode)
+ struct inode *inode = file_inode(file);
+ handle_t *handle = NULL;
+ unsigned int max_blocks;
+ loff_t new_size = 0;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned int blkbits = inode->i_blkbits;
+ trace_ext4_zero_range(inode, offset, len, mode);
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ * Round up offset. This is not fallocate, we need to zero out
+ * blocks, so convert interior block aligned part of the range to
+ * unwritten and possibly manually zero out unaligned parts of the
+ start = round_up(offset, 1 << blkbits);
+ end = round_down((offset + len), 1 << blkbits);
+ if (start < offset || end > offset + len)
+ partial = (offset + len) & ((1 << blkbits) - 1);
+ lblk = start >> blkbits;
+ max_blocks = (end >> blkbits);
+ if (max_blocks < lblk)
+ max_blocks -= lblk;
+ flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+ mutex_lock(&inode->i_mutex);
+ * Indirect files do not support unwritten extents
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ ret = inode_newsize_ok(inode, new_size);
+ * If we have a partial block after EOF we have to allocate
+ * the entire block.
+ if (max_blocks > 0) {
+ /* Now release the pages and zero block aligned part of pages*/
+ truncate_pagecache_range(inode, start, end - 1);
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ * Remove entire range from the extent status tree.
+ ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(inode->i_sb, ret);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ if (!ret && new_size) {
+ if (new_size > i_size_read(inode))
+ i_size_write(inode, new_size);
+ if (new_size > EXT4_I(inode)->i_disksize)
+ ext4_update_i_disksize(inode, new_size);
+ } else if (!ret && !new_size) {
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ if ((offset + len) > i_size_read(inode))
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ ext4_mark_inode_dirty(handle, inode);
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ if (file->f_flags & O_SYNC)
+ ext4_handle_sync(handle);
+ ext4_journal_stop(handle);
+ ext4_inode_resume_unlocked_dio(inode);
+ mutex_unlock(&inode->i_mutex);
* preallocate space for a file. This implements ext4's fallocate file
* operation, which gets called from sys_fallocate system call.
@@ -4625,7 +4867,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE))
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4645,6 +4887,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ return ext4_zero_range(file, offset, len, mode);
trace_ext4_fallocate_enter(inode, offset, len, mode);
lblk = offset >> blkbits;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ab3e835..7cc2455 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -503,6 +503,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct extent_status es;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -558,7 +559,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
EXT4_GET_BLOCKS_KEEP_SIZE);
if (unlikely(retval != map->m_len)) {
@@ -585,7 +585,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
@@ -602,7 +602,13 @@ found:
* with buffer head unmapped.
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+ * If we need to convert extent to unwritten
+ * we continue and do the actual work in
+ * ext4_ext_map_blocks()
+ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
* Here we clear m_flags because after allocating an new extent,
@@ -658,7 +664,6 @@ found:
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
if (unlikely(retval != map->m_len)) {
@@ -693,7 +698,7 @@ found:
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
@@ -3507,7 +3512,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (!S_ISREG(inode->i_mode))
- trace_ext4_punch_hole(inode, offset, length);
+ trace_ext4_punch_hole(inode, offset, length, 0);
* Write out all dirty pages to avoid race conditions
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index e9d7ee7..010ea89 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -21,6 +21,10 @@ struct extent_status;
#define FALLOC_FL_COLLAPSE_RANGE 0x08
+#ifndef FALLOC_FL_ZERO_RANGE
+#define FALLOC_FL_ZERO_RANGE 0x10
#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
#define show_mballoc_flags(flags) __print_flags(flags, "|", \
@@ -77,7 +81,8 @@ struct extent_status;
{ FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \
{ FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \
{ FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \
- { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"})
+ { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \
+ { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"})
TRACE_EVENT(ext4_free_inode,
@@ -1339,7 +1344,7 @@ TRACE_EVENT(ext4_direct_IO_exit,
__entry->rw, __entry->ret)
-TRACE_EVENT(ext4_fallocate_enter,
+DECLARE_EVENT_CLASS(ext4__fallocate_mode,
TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
TP_ARGS(inode, offset, len, mode),
@@ -1347,23 +1352,45 @@ TRACE_EVENT(ext4_fallocate_enter,
__field( dev_t, dev )
__field( ino_t, ino )
- __field( loff_t, pos )
- __field( loff_t, len )
+ __field( loff_t, offset )
+ __field( loff_t, len )
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->pos = offset;
+ __entry->offset = offset;
__entry->mode = mode;
- TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %s",
+ TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino, __entry->pos,
- __entry->len, show_falloc_mode(__entry->mode))
+ (unsigned long) __entry->ino,
+ __entry->offset, __entry->len,
+ show_falloc_mode(__entry->mode))
+DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,
+ TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+ TP_ARGS(inode, offset, len, mode)
+DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,
+ TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+ TP_ARGS(inode, offset, len, mode)
+DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
+ TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+ TP_ARGS(inode, offset, len, mode)
TRACE_EVENT(ext4_fallocate_exit,
@@ -1395,31 +1422,6 @@ TRACE_EVENT(ext4_fallocate_exit,
-TRACE_EVENT(ext4_punch_hole,
- TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
- TP_ARGS(inode, offset, len),
- __field( dev_t, dev )
- __field( ino_t, ino )
- __field( loff_t, offset )
- __field( loff_t, len )
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = inode->i_ino;
- __entry->offset = offset;
- __entry->len = len;
- TP_printk("dev %d,%d ino %lu offset %lld len %lld",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- (unsigned long) __entry->ino,
- __entry->offset, __entry->len)
TRACE_EVENT(ext4_unlink_enter,
TP_PROTO(struct inode *parent, struct dentry *dentry),
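
For reference, the interior/edge split performed by ext4_zero_range() amounts
to the following stand-alone sketch (the block size and byte range below are
made up for the example):

	#include <stdio.h>

	int main(void)
	{
		unsigned int blkbits = 12;			/* assume 4 KiB blocks */
		unsigned long long bs = 1ULL << blkbits;
		unsigned long long offset = 5000, len = 20000;	/* arbitrary range */

		/* round_up()/round_down() equivalents for a power-of-two block size */
		unsigned long long start = (offset + bs - 1) & ~(bs - 1);
		unsigned long long end = (offset + len) & ~(bs - 1);

		/* Unaligned head and tail are zeroed explicitly ... */
		printf("zero by hand: [%llu,%llu) and [%llu,%llu)\n",
		       offset, start, end, offset + len);

		/* ... while the block-aligned interior is converted to unwritten extents. */
		if (end > start)
			printf("convert blocks %llu..%llu to unwritten\n",
			       start >> blkbits, (end >> blkbits) - 1);
		return 0;
	}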