1 ext4: DAX iomap write support
3 From: Jan Kara <jack@suse.cz>
5 Implement DAX writes using the new iomap infrastructure instead of
6 overloading the direct IO path.
8 Signed-off-by: Jan Kara <jack@suse.cz>
9 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
11 fs/ext4/file.c | 40 ++++++++++++++++++
12 fs/ext4/inode.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
13 2 files changed, 160 insertions(+), 6 deletions(-)
15 diff --git a/fs/ext4/file.c b/fs/ext4/file.c
16 index 1f25c644cb12..1953fe34f9fe 100644
19 @@ -169,6 +169,41 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
20 return iov_iter_count(from);
25 +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
27 + struct inode *inode = file_inode(iocb->ki_filp);
29 + bool overwrite = false;
32 + ret = ext4_write_checks(iocb, from);
35 + ret = file_remove_privs(iocb->ki_filp);
38 + ret = file_update_time(iocb->ki_filp);
42 + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
44 + downgrade_write(&inode->i_rwsem);
46 + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
49 + inode_unlock(inode);
51 + inode_unlock_shared(inode);
53 + ret = generic_write_sync(iocb, ret);
59 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
61 @@ -178,6 +213,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
67 + return ext4_dax_write_iter(iocb, from);
71 ret = ext4_write_checks(iocb, from);
73 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
74 index df017ce3e52d..a7079cab645a 100644
77 @@ -3321,18 +3321,79 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
78 struct ext4_map_blocks map;
81 - if (flags & IOMAP_WRITE)
84 if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
87 map.m_lblk = first_block;
88 map.m_len = last_block - first_block + 1;
90 - ret = ext4_map_blocks(NULL, inode, &map, 0);
93 + if (!(flags & IOMAP_WRITE)) {
94 + ret = ext4_map_blocks(NULL, inode, &map, 0);
100 + /* Trim mapping request to maximum we can map at once for DIO */
101 + if (map.m_len > DIO_MAX_BLOCKS)
102 + map.m_len = DIO_MAX_BLOCKS;
103 + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
106 + * Either we allocate blocks and then we don't get unwritten
107 + * extent so we have reserved enough credits, or the blocks
108 + * are already allocated and unwritten and in that case
109 + * extent conversion fits in the credits as well.
111 + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
113 + if (IS_ERR(handle))
114 + return PTR_ERR(handle);
116 + ret = ext4_map_blocks(handle, inode, &map,
117 + EXT4_GET_BLOCKS_PRE_IO |
118 + EXT4_GET_BLOCKS_CREATE_ZERO);
120 + ext4_journal_stop(handle);
121 + if (ret == -ENOSPC &&
122 + ext4_should_retry_alloc(inode->i_sb, &retries))
126 + /* For DAX writes we need to zero out unwritten extents */
127 + if (map.m_flags & EXT4_MAP_UNWRITTEN) {
129 + * We are protected by i_mmap_sem or i_rwsem so we know
130 + * block cannot go away from under us even though we
131 + * dropped i_data_sem. Convert extent to written and
132 + * write zeros there.
134 + ret = ext4_map_blocks(handle, inode, &map,
135 + EXT4_GET_BLOCKS_CONVERT |
136 + EXT4_GET_BLOCKS_CREATE_ZERO);
138 + ext4_journal_stop(handle);
144 + * If we added blocks beyond i_size we need to make sure they
145 + * will get truncated if we crash before updating i_size in
146 + * ext4_iomap_end().
148 + if (first_block + map.m_len >
149 + (inode->i_size + (1 << blkbits) - 1) >> blkbits) {
152 + err = ext4_orphan_add(handle, inode);
154 + ext4_journal_stop(handle);
158 + ext4_journal_stop(handle);
162 iomap->bdev = inode->i_sb->s_bdev;
163 @@ -3360,8 +3421,61 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
167 +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
168 + ssize_t written, unsigned flags, struct iomap *iomap)
172 + int blkbits = inode->i_blkbits;
173 + bool truncate = false;
175 + if (!(flags & IOMAP_WRITE))
178 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
179 + if (IS_ERR(handle)) {
180 + ret = PTR_ERR(handle);
183 + if (ext4_update_inode_size(inode, offset + written))
184 + ext4_mark_inode_dirty(handle, inode);
186 + * We may need to truncate allocated but not written blocks beyond EOF.
188 + if (iomap->offset + iomap->length >
189 + ALIGN(inode->i_size, 1 << blkbits)) {
190 + ext4_lblk_t written_blk, end_blk;
192 + written_blk = (offset + written) >> blkbits;
193 + end_blk = (offset + length) >> blkbits;
194 + if (written_blk < end_blk && ext4_can_truncate(inode))
198 + * Remove inode from orphan list if we were extending an inode and
199 + * everything went fine.
201 + if (!truncate && inode->i_nlink &&
202 + !list_empty(&EXT4_I(inode)->i_orphan))
203 + ext4_orphan_del(handle, inode);
204 + ext4_journal_stop(handle);
206 + ext4_truncate_failed_write(inode);
209 + * If truncate failed early the inode might still be on the
210 + * orphan list; we need to make sure the inode is removed from
211 + * the orphan list in that case.
213 + if (inode->i_nlink)
214 + ext4_orphan_del(NULL, inode);
219 struct iomap_ops ext4_iomap_ops = {
220 .iomap_begin = ext4_iomap_begin,
221 + .iomap_end = ext4_iomap_end,