Check in Jan Kara's v3 DAX iomap patches
ext4: DAX iomap write support

From: Jan Kara <jack@suse.cz>

Implement DAX writes using the new iomap infrastructure instead of
overloading the direct IO path.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/file.c  |  40 ++++++++++++++++++
 fs/ext4/inode.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 160 insertions(+), 6 deletions(-)
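As a quick orientation before the diff: the write path this patch adds boils
down to the condensed sketch below. It is distilled from the fs/ext4/file.c
hunk that follows (write checks, timestamp update and error handling elided);
the function name dax_write_sketch() is illustrative only and is not part of
the series.

/*
 * Condensed sketch of the DAX write path introduced below.  All calls are
 * the ones used by the real ext4_dax_write_iter(); only the permission and
 * mtime checks and the error paths have been dropped.
 */
static ssize_t dax_write_sketch(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	bool overwrite = false;
	ssize_t ret;

	inode_lock(inode);		/* exclusive i_rwsem by default */
	if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
		/*
		 * A pure overwrite cannot change the block mapping, so the
		 * copy itself only needs i_rwsem held shared.
		 */
		overwrite = true;
		downgrade_write(&inode->i_rwsem);
	}
	/*
	 * The generic DAX code copies data into persistent memory and calls
	 * ext4_iomap_begin()/ext4_iomap_end() around every extent it maps.
	 */
	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
	if (overwrite)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
	return ret;
}
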
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1f25c644cb12..1953fe34f9fe 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -169,6 +169,41 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	return iov_iter_count(from);
 }
 
+#ifdef CONFIG_FS_DAX
+static ssize_t
+ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+	bool overwrite = false;
+
+	inode_lock(inode);
+	ret = ext4_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+	ret = file_remove_privs(iocb->ki_filp);
+	if (ret)
+		goto out;
+	ret = file_update_time(iocb->ki_filp);
+	if (ret)
+		goto out;
+
+	if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
+		overwrite = true;
+		downgrade_write(&inode->i_rwsem);
+	}
+	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+out:
+	if (!overwrite)
+		inode_unlock(inode);
+	else
+		inode_unlock_shared(inode);
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
+}
+#endif
+
 static ssize_t
 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
@@ -178,6 +213,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	int overwrite = 0;
 	ssize_t ret;
 
+#ifdef CONFIG_FS_DAX
+	if (IS_DAX(inode))
+		return ext4_dax_write_iter(iocb, from);
+#endif
+
 	inode_lock(inode);
 	ret = ext4_write_checks(iocb, from);
 	if (ret <= 0)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index df017ce3e52d..a7079cab645a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3321,18 +3321,79 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 	struct ext4_map_blocks map;
 	int ret;
 
-	if (flags & IOMAP_WRITE)
-		return -EIO;
-
 	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
 		return -ERANGE;
 
 	map.m_lblk = first_block;
 	map.m_len = last_block - first_block + 1;
 
-	ret = ext4_map_blocks(NULL, inode, &map, 0);
-	if (ret < 0)
-		return ret;
+	if (!(flags & IOMAP_WRITE)) {
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+	} else {
+		int dio_credits;
+		handle_t *handle;
+		int retries = 0;
+
+		/* Trim mapping request to maximum we can map at once for DIO */
+		if (map.m_len > DIO_MAX_BLOCKS)
+			map.m_len = DIO_MAX_BLOCKS;
+		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
+retry:
+		/*
+		 * Either we allocate blocks and then we don't get an unwritten
+		 * extent, so we have reserved enough credits, or the blocks
+		 * are already allocated and unwritten, and in that case
+		 * extent conversion fits in the credits as well.
+		 */
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    dio_credits);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+
+		ret = ext4_map_blocks(handle, inode, &map,
+				      EXT4_GET_BLOCKS_PRE_IO |
+				      EXT4_GET_BLOCKS_CREATE_ZERO);
+		if (ret < 0) {
+			ext4_journal_stop(handle);
+			if (ret == -ENOSPC &&
+			    ext4_should_retry_alloc(inode->i_sb, &retries))
+				goto retry;
+			return ret;
+		}
+		/* For DAX writes we need to zero out unwritten extents */
+		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+			/*
+			 * We are protected by i_mmap_sem or i_rwsem, so we know
+			 * the block cannot go away from under us even though we
+			 * dropped i_data_sem. Convert the extent to written and
+			 * write zeros there.
+			 */
+			ret = ext4_map_blocks(handle, inode, &map,
+					      EXT4_GET_BLOCKS_CONVERT |
+					      EXT4_GET_BLOCKS_CREATE_ZERO);
+			if (ret < 0) {
+				ext4_journal_stop(handle);
+				return ret;
+			}
+		}
+
+		/*
+		 * If we added blocks beyond i_size, we need to make sure they
+		 * will get truncated if we crash before updating i_size in
+		 * ext4_iomap_end().
+		 */
+		if (first_block + map.m_len >
+		    (inode->i_size + (1 << blkbits) - 1) >> blkbits) {
+			int err;
+
+			err = ext4_orphan_add(handle, inode);
+			if (err < 0) {
+				ext4_journal_stop(handle);
+				return err;
+			}
+		}
+		ext4_journal_stop(handle);
+	}
 
 	iomap->flags = 0;
 	iomap->bdev = inode->i_sb->s_bdev;
@@ -3360,8 +3421,61 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 	return 0;
 }
 
+static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+			  ssize_t written, unsigned flags, struct iomap *iomap)
+{
+	int ret = 0;
+	handle_t *handle;
+	int blkbits = inode->i_blkbits;
+	bool truncate = false;
+
+	if (!(flags & IOMAP_WRITE))
+		return 0;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto orphan_del;
+	}
+	if (ext4_update_inode_size(inode, offset + written))
+		ext4_mark_inode_dirty(handle, inode);
+	/*
+	 * We may need to truncate allocated but not written blocks beyond EOF.
+	 */
+	if (iomap->offset + iomap->length >
+	    ALIGN(inode->i_size, 1 << blkbits)) {
+		ext4_lblk_t written_blk, end_blk;
+
+		written_blk = (offset + written) >> blkbits;
+		end_blk = (offset + length) >> blkbits;
+		if (written_blk < end_blk && ext4_can_truncate(inode))
+			truncate = true;
+	}
+	/*
+	 * Remove the inode from the orphan list if we were extending an inode
+	 * and everything went fine.
+	 */
+	if (!truncate && inode->i_nlink &&
+	    !list_empty(&EXT4_I(inode)->i_orphan))
+		ext4_orphan_del(handle, inode);
+	ext4_journal_stop(handle);
+	if (truncate) {
+		ext4_truncate_failed_write(inode);
+orphan_del:
+		/*
+		 * If truncate failed early, the inode might still be on the
+		 * orphan list; we need to make sure the inode is removed from
+		 * the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+	return ret;
+}
+
 struct iomap_ops ext4_iomap_ops = {
 	.iomap_begin		= ext4_iomap_begin,
+	.iomap_end		= ext4_iomap_end,
 };
 
 #else
-- 
2.6.6
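
For context on the cleanup logic in ext4_iomap_end(): dax_iomap_rw() calls
->iomap_begin() before copying data into a mapped extent and ->iomap_end()
afterwards with the number of bytes actually written. The sketch below only
illustrates that contract; the loop structure and the names dax_rw_sketch()
and dax_copy_sketch() are illustrative assumptions, not the real fs/dax.c or
iomap code.

/*
 * Illustrative sketch of the ->iomap_begin()/->iomap_end() contract that
 * ext4_iomap_end() above relies on.  The real iteration lives in the
 * generic DAX/iomap code; nothing here is part of the patch.
 */

/* Hypothetical stand-in for the generic copy into persistent memory. */
static ssize_t dax_copy_sketch(struct iomap *iomap, loff_t pos, loff_t len)
{
	return len;	/* pretend the whole extent was copied */
}

static ssize_t dax_rw_sketch(struct inode *inode, loff_t pos, loff_t count,
			     unsigned flags, struct iomap_ops *ops)
{
	ssize_t done = 0;

	while (count > 0) {
		struct iomap iomap = { 0 };
		loff_t len;
		ssize_t written;
		int ret;

		/*
		 * The filesystem maps (and, for writes, allocates and zeroes)
		 * blocks; for extending writes ext4 also puts the inode on
		 * the orphan list here.
		 */
		ret = ops->iomap_begin(inode, pos, count, flags, &iomap);
		if (ret)
			return done ? done : ret;

		/* Only the part of the request covered by this mapping. */
		len = min_t(loff_t, count, iomap.offset + iomap.length - pos);

		/* Data goes directly into persistent memory; 'written' may
		 * be short if a fault occurs mid-copy. */
		written = dax_copy_sketch(&iomap, pos, len);
		if (written < 0)
			written = 0;

		/*
		 * The filesystem updates i_size, drops the orphan entry, and
		 * truncates blocks that were allocated beyond what was
		 * actually written (written < len).
		 */
		ret = ops->iomap_end(inode, pos, len, written, flags, &iomap);
		if (ret)
			return done ? done : ret;
		if (!written)
			break;

		pos += written;
		count -= written;
		done += written;
	}
	return done;
}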