1 ext4: Introduce FALLOC_FL_ZERO_RANGE flag for fallocate
3 From: Lukas Czerner <lczerner@redhat.com>
5 Introduce new FALLOC_FL_ZERO_RANGE flag for fallocate. This has the same
6 functionality as xfs ioctl XFS_IOC_ZERO_RANGE.
8 It can be used to convert a range of a file to zeros, preferably without
9 issuing data IO. Blocks should be preallocated for the regions that span
10 holes in the file, and the entire range is preferably converted to unwritten extents.
13 This can also be used to preallocate blocks past EOF in the same way as
14 with fallocate. The FALLOC_FL_KEEP_SIZE flag causes the inode
15 size to remain the same.
17 Also add appropriate tracepoints.
19 Signed-off-by: Lukas Czerner <lczerner@redhat.com>
20 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
23 fs/ext4/extents.c | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
24 fs/ext4/inode.c | 17 ++++---
25 include/trace/events/ext4.h | 74 ++++++++++++++++--------------
26 4 files changed, 308 insertions(+), 57 deletions(-)
28 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
29 index beec427..1b3cbf8 100644
32 @@ -568,6 +568,8 @@ enum {
33 #define EXT4_GET_BLOCKS_NO_LOCK 0x0100
34 /* Do not put hole in extent cache */
35 #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
36 + /* Convert written extents to unwritten */
37 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
40 * The bit position of these flags must not overlap with any of the
41 diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
42 index 1a616af..7863316 100644
43 --- a/fs/ext4/extents.c
44 +++ b/fs/ext4/extents.c
47 #include <trace/events/ext4.h>
49 -#ifndef FALLOC_FL_COLLAPSE_RANGE
50 -#define FALLOC_FL_COLLAPSE_RANGE 0x08
54 * used by extent splitting.
56 @@ -3606,6 +3602,8 @@ out:
57 * b> Splits in two extents: Write is happening at either end of the extent
58 * c> Splits in three extents: Somone is writing in middle of the extent
60 + * This works the same way in the case of initialized -> unwritten conversion.
62 * One of more index blocks maybe needed if the extent tree grow after
63 * the uninitialized extent split. To prevent ENOSPC occur at the IO
64 * complete, we need to split the uninitialized extent before DIO submit
65 @@ -3616,7 +3614,7 @@ out:
67 * Returns the size of uninitialized extent to be written on success.
69 -static int ext4_split_unwritten_extents(handle_t *handle,
70 +static int ext4_split_convert_extents(handle_t *handle,
72 struct ext4_map_blocks *map,
73 struct ext4_ext_path *path,
74 @@ -3628,9 +3626,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
76 int split_flag = 0, depth;
78 - ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
79 - "block %llu, max_blocks %u\n", inode->i_ino,
80 - (unsigned long long)map->m_lblk, map->m_len);
81 + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
82 + __func__, inode->i_ino,
83 + (unsigned long long)map->m_lblk, map->m_len);
85 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
86 inode->i_sb->s_blocksize_bits;
87 @@ -3645,14 +3643,73 @@ static int ext4_split_unwritten_extents(handle_t *handle,
88 ee_block = le32_to_cpu(ex->ee_block);
89 ee_len = ext4_ext_get_actual_len(ex);
91 - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
92 - split_flag |= EXT4_EXT_MARK_UNINIT2;
93 - if (flags & EXT4_GET_BLOCKS_CONVERT)
94 - split_flag |= EXT4_EXT_DATA_VALID2;
95 + /* Caller requested initialized -> unwritten conversion */
96 + if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
97 + split_flag |= EXT4_EXT_DATA_VALID1;
98 + /* Caller requested conversion to initialized */
99 + } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
100 + split_flag |= ee_block + ee_len <= eof_block ?
101 + EXT4_EXT_MAY_ZEROOUT : 0;
102 + split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
104 flags |= EXT4_GET_BLOCKS_PRE_IO;
105 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
108 +static int ext4_convert_initialized_extents(handle_t *handle,
109 + struct inode *inode,
110 + struct ext4_map_blocks *map,
111 + struct ext4_ext_path *path)
113 + struct ext4_extent *ex;
114 + ext4_lblk_t ee_block;
115 + unsigned int ee_len;
119 + depth = ext_depth(inode);
120 + ex = path[depth].p_ext;
121 + ee_block = le32_to_cpu(ex->ee_block);
122 + ee_len = ext4_ext_get_actual_len(ex);
124 + ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
125 + __func__, inode->i_ino,
126 + (unsigned long long)ee_block, ee_len);
128 + if (ee_block != map->m_lblk || ee_len > map->m_len) {
129 + err = ext4_split_convert_extents(handle, inode, map, path,
130 + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
133 + ext4_ext_drop_refs(path);
134 + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
135 + if (IS_ERR(path)) {
136 + err = PTR_ERR(path);
139 + depth = ext_depth(inode);
140 + ex = path[depth].p_ext;
143 + err = ext4_ext_get_access(handle, inode, path + depth);
146 + /* first mark the extent as uninitialized */
147 + ext4_ext_mark_uninitialized(ex);
149 + /* note: ext4_ext_correct_indexes() isn't needed here because
150 + * borders are not changed
152 + ext4_ext_try_to_merge(handle, inode, path, ex);
154 + /* Mark modified extent as dirty */
155 + err = ext4_ext_dirty(handle, inode, path + path->p_depth);
157 + ext4_ext_show_leaf(inode, path);
162 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
164 struct ext4_map_blocks *map,
165 @@ -3686,8 +3743,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
166 inode->i_ino, (unsigned long long)ee_block, ee_len,
167 (unsigned long long)map->m_lblk, map->m_len);
169 - err = ext4_split_unwritten_extents(handle, inode, map, path,
170 - EXT4_GET_BLOCKS_CONVERT);
171 + err = ext4_split_convert_extents(handle, inode, map, path,
172 + EXT4_GET_BLOCKS_CONVERT);
175 ext4_ext_drop_refs(path);
176 @@ -3888,6 +3945,42 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
180 +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
181 + struct ext4_map_blocks *map,
182 + struct ext4_ext_path *path, int flags,
183 + unsigned int allocated, ext4_fsblk_t newblock)
189 + * Make sure that the extent is no bigger than we support with
190 + * uninitialized extent
192 + if (map->m_len > EXT_UNINIT_MAX_LEN)
193 + map->m_len = EXT_UNINIT_MAX_LEN / 2;
195 + ret = ext4_convert_initialized_extents(handle, inode, map,
198 + ext4_update_inode_fsync_trans(handle, inode, 1);
199 + err = check_eofblocks_fl(handle, inode, map->m_lblk,
203 + map->m_flags |= EXT4_MAP_UNWRITTEN;
204 + if (allocated > map->m_len)
205 + allocated = map->m_len;
206 + map->m_len = allocated;
209 + ext4_ext_drop_refs(path);
212 + return err ? err : allocated;
216 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
217 struct ext4_map_blocks *map,
218 struct ext4_ext_path *path, int flags,
219 @@ -3914,8 +4007,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
221 /* get_block() before submit the IO, split the extent */
222 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
223 - ret = ext4_split_unwritten_extents(handle, inode, map,
225 + ret = ext4_split_convert_extents(handle, inode, map,
226 + path, flags | EXT4_GET_BLOCKS_CONVERT);
230 @@ -4219,7 +4312,17 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
231 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
232 ee_block, ee_len, newblock);
234 - if (!ext4_ext_is_uninitialized(ex))
236 + * If the extent is initialized check whether the
237 + * caller wants to convert it to unwritten.
239 + if ((!ext4_ext_is_uninitialized(ex)) &&
240 + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
241 + allocated = ext4_ext_convert_initialized_extent(
242 + handle, inode, map, path, flags,
243 + allocated, newblock);
245 + } else if (!ext4_ext_is_uninitialized(ex))
248 ret = ext4_ext_handle_uninitialized_extents(
249 @@ -4608,6 +4711,136 @@ retry:
250 return ret > 0 ? ret2 : ret;
253 +static long ext4_zero_range(struct file *file, loff_t offset,
254 + loff_t len, int mode)
256 + struct inode *inode = file_inode(file);
257 + handle_t *handle = NULL;
258 + unsigned int max_blocks;
259 + loff_t new_size = 0;
265 + struct address_space *mapping = inode->i_mapping;
266 + unsigned int blkbits = inode->i_blkbits;
268 + trace_ext4_zero_range(inode, offset, len, mode);
271 + * Write out all dirty pages to avoid race conditions
272 + * Then release them.
274 + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
275 + ret = filemap_write_and_wait_range(mapping, offset,
282 + * Round up offset. This is not fallocate, we need to zero out
283 + * blocks, so convert interior block aligned part of the range to
284 + * unwritten and possibly manually zero out unaligned parts of the
287 + start = round_up(offset, 1 << blkbits);
288 + end = round_down((offset + len), 1 << blkbits);
290 + if (start < offset || end > offset + len)
292 + partial = (offset + len) & ((1 << blkbits) - 1);
294 + lblk = start >> blkbits;
295 + max_blocks = (end >> blkbits);
296 + if (max_blocks < lblk)
299 + max_blocks -= lblk;
301 + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
302 + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
303 + if (mode & FALLOC_FL_KEEP_SIZE)
304 + flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
306 + mutex_lock(&inode->i_mutex);
309 + * Indirect files do not support unwritten extents
311 + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
316 + if (!(mode & FALLOC_FL_KEEP_SIZE) &&
317 + offset + len > i_size_read(inode)) {
318 + new_size = offset + len;
319 + ret = inode_newsize_ok(inode, new_size);
323 + * If we have a partial block after EOF we have to allocate
324 + * the entire block.
330 + if (max_blocks > 0) {
332 + /* Now release the pages and zero block aligned part of pages*/
333 + truncate_pagecache_range(inode, start, end - 1);
335 + /* Wait all existing dio workers, newcomers will block on i_mutex */
336 + ext4_inode_block_unlocked_dio(inode);
337 + inode_dio_wait(inode);
340 + * Remove entire range from the extent status tree.
342 + ret = ext4_es_remove_extent(inode, lblk, max_blocks);
346 + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
352 + handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
353 + if (IS_ERR(handle)) {
354 + ret = PTR_ERR(handle);
355 + ext4_std_error(inode->i_sb, ret);
359 + inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
361 + if (!ret && new_size) {
362 + if (new_size > i_size_read(inode))
363 + i_size_write(inode, new_size);
364 + if (new_size > EXT4_I(inode)->i_disksize)
365 + ext4_update_i_disksize(inode, new_size);
367 + ext4_mark_inode_dirty(handle, inode);
369 + /* Zero out partial block at the edges of the range */
370 + ret = ext4_zero_partial_blocks(handle, inode, offset, len);
372 + if (file->f_flags & O_SYNC)
373 + ext4_handle_sync(handle);
375 + ext4_journal_stop(handle);
377 + ext4_inode_resume_unlocked_dio(inode);
379 + mutex_unlock(&inode->i_mutex);
384 * preallocate space for a file. This implements ext4's fallocate file
385 * operation, which gets called from sys_fallocate system call.
386 @@ -4629,7 +4862,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
388 /* Return error if mode is not supported */
389 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
390 - FALLOC_FL_COLLAPSE_RANGE))
391 + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
394 if (mode & FALLOC_FL_PUNCH_HOLE)
395 @@ -4649,6 +4882,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
396 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
399 + if (mode & FALLOC_FL_ZERO_RANGE)
400 + return ext4_zero_range(file, offset, len, mode);
402 trace_ext4_fallocate_enter(inode, offset, len, mode);
403 lblk = offset >> blkbits;
405 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
406 index ab3e835..7cc2455 100644
407 --- a/fs/ext4/inode.c
408 +++ b/fs/ext4/inode.c
409 @@ -503,6 +503,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
411 struct extent_status es;
414 #ifdef ES_AGGRESSIVE_TEST
415 struct ext4_map_blocks orig_map;
417 @@ -558,7 +559,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
418 EXT4_GET_BLOCKS_KEEP_SIZE);
424 if (unlikely(retval != map->m_len)) {
425 @@ -585,7 +585,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
428 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
429 - int ret = check_block_validity(inode, map);
430 + ret = check_block_validity(inode, map);
434 @@ -602,7 +602,13 @@ found:
435 * with buffer head unmapped.
437 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
440 + * If we need to convert extent to unwritten
441 + * we continue and do the actual work in
442 + * ext4_ext_map_blocks()
444 + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
448 * Here we clear m_flags because after allocating an new extent,
449 @@ -658,7 +664,6 @@ found:
450 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
456 if (unlikely(retval != map->m_len)) {
457 @@ -693,7 +698,7 @@ found:
459 up_write((&EXT4_I(inode)->i_data_sem));
460 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
461 - int ret = check_block_validity(inode, map);
462 + ret = check_block_validity(inode, map);
466 @@ -3507,7 +3512,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
467 if (!S_ISREG(inode->i_mode))
470 - trace_ext4_punch_hole(inode, offset, length);
471 + trace_ext4_punch_hole(inode, offset, length, 0);
474 * Write out all dirty pages to avoid race conditions
475 diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
476 index dc29153..207288f 100644
477 --- a/include/trace/events/ext4.h
478 +++ b/include/trace/events/ext4.h
479 @@ -16,6 +16,15 @@ struct mpage_da_data;
480 struct ext4_map_blocks;
481 struct extent_status;
483 +/* shim until we merge in the xfs_collapse_range branch */
484 +#ifndef FALLOC_FL_COLLAPSE_RANGE
485 +#define FALLOC_FL_COLLAPSE_RANGE 0x08
488 +#ifndef FALLOC_FL_ZERO_RANGE
489 +#define FALLOC_FL_ZERO_RANGE 0x10
492 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
494 #define show_mballoc_flags(flags) __print_flags(flags, "|", \
495 @@ -71,7 +80,9 @@ struct extent_status;
496 #define show_falloc_mode(mode) __print_flags(mode, "|", \
497 { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \
498 { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \
499 - { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"})
500 + { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \
501 + { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \
502 + { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"})
505 TRACE_EVENT(ext4_free_inode,
506 @@ -1333,7 +1344,7 @@ TRACE_EVENT(ext4_direct_IO_exit,
507 __entry->rw, __entry->ret)
510 -TRACE_EVENT(ext4_fallocate_enter,
511 +DECLARE_EVENT_CLASS(ext4__fallocate_mode,
512 TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
514 TP_ARGS(inode, offset, len, mode),
515 @@ -1341,23 +1352,45 @@ TRACE_EVENT(ext4_fallocate_enter,
517 __field( dev_t, dev )
518 __field( ino_t, ino )
519 - __field( loff_t, pos )
520 - __field( loff_t, len )
521 + __field( loff_t, offset )
522 + __field( loff_t, len )
527 __entry->dev = inode->i_sb->s_dev;
528 __entry->ino = inode->i_ino;
529 - __entry->pos = offset;
530 + __entry->offset = offset;
532 __entry->mode = mode;
535 - TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %s",
536 + TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
537 MAJOR(__entry->dev), MINOR(__entry->dev),
538 - (unsigned long) __entry->ino, __entry->pos,
539 - __entry->len, show_falloc_mode(__entry->mode))
540 + (unsigned long) __entry->ino,
541 + __entry->offset, __entry->len,
542 + show_falloc_mode(__entry->mode))
545 +DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,
547 + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
549 + TP_ARGS(inode, offset, len, mode)
552 +DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,
554 + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
556 + TP_ARGS(inode, offset, len, mode)
559 +DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
561 + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
563 + TP_ARGS(inode, offset, len, mode)
566 TRACE_EVENT(ext4_fallocate_exit,
567 @@ -1389,31 +1422,6 @@ TRACE_EVENT(ext4_fallocate_exit,
571 -TRACE_EVENT(ext4_punch_hole,
572 - TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
574 - TP_ARGS(inode, offset, len),
577 - __field( dev_t, dev )
578 - __field( ino_t, ino )
579 - __field( loff_t, offset )
580 - __field( loff_t, len )
584 - __entry->dev = inode->i_sb->s_dev;
585 - __entry->ino = inode->i_ino;
586 - __entry->offset = offset;
587 - __entry->len = len;
590 - TP_printk("dev %d,%d ino %lu offset %lld len %lld",
591 - MAJOR(__entry->dev), MINOR(__entry->dev),
592 - (unsigned long) __entry->ino,
593 - __entry->offset, __entry->len)
596 TRACE_EVENT(ext4_unlink_enter,
597 TP_PROTO(struct inode *parent, struct dentry *dentry),