drop save-goal-location-struct-ext4_allocation_context.ac_g_ex
[ext4-patch-queue.git] / fix-punch-hole-on-files-with-indirect-mapping
blob3a57d028c52c2c1ade4786e05b297d9eca0b8748
1 ext4: fix punch hole on files with indirect mapping
3 From: Lukas Czerner <lczerner@redhat.com>
5 Currently punch hole code on files with direct/indirect mapping has some
6 problems which may lead to a data loss. For example (from Jan Kara):
8 fallocate -n -p 10240000 4096
10 will punch the range 10240000 - 12632064 instead of the range 10240000 -
11 10244096.
13 Also the code is a bit weird and it's not using infrastructure provided
14 by indirect.c, but rather creating its own way.
16 This patch fixes the issues as well as making the operation run 4
17 times faster from my testing (punching out 60GB file). It uses similar
18 approach used in ext4_ind_truncate() which takes advantage of
19 ext4_free_branches() function.
21 Also rename the ext4_free_hole_blocks() to something more sensible, like
22 the equivalent we have for extent mapped files. Call it
23 ext4_ind_remove_space().
25 This has been tested mostly with fsx and some xfstests which are testing
26 punch hole but do not require unwritten extents, which are not
27 supported with direct/indirect mapping. No problems showed up even with
28 1024k block size.
30 CC: stable@vger.kernel.org
31 Signed-off-by: Lukas Czerner <lczerner@redhat.com>
32 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
33 ---
34  fs/ext4/ext4.h     |   4 +-
35  fs/ext4/indirect.c | 277 +++++++++++++++++++++++++++++++++++++++--------------
36  fs/ext4/inode.c    |   2 +-
37  3 files changed, 207 insertions(+), 76 deletions(-)
39 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
40 index d35c78c..5535ed2 100644
41 --- a/fs/ext4/ext4.h
42 +++ b/fs/ext4/ext4.h
43 @@ -2143,8 +2143,8 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
44  extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
45  extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
46  extern void ext4_ind_truncate(handle_t *, struct inode *inode);
47 -extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
48 -                                ext4_lblk_t first, ext4_lblk_t stop);
49 +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
50 +                                ext4_lblk_t start, ext4_lblk_t end);
52  /* ioctl.c */
53  extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
54 diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
55 index fd69da1..e75f840 100644
56 --- a/fs/ext4/indirect.c
57 +++ b/fs/ext4/indirect.c
58 @@ -1295,97 +1295,220 @@ do_indirects:
59         }
60  }
62 -static int free_hole_blocks(handle_t *handle, struct inode *inode,
63 -                           struct buffer_head *parent_bh, __le32 *i_data,
64 -                           int level, ext4_lblk_t first,
65 -                           ext4_lblk_t count, int max)
66 +/**
67 + *     ext4_ind_remove_space - remove space from the range
68 + *     @handle: JBD handle for this transaction
69 + *     @inode: inode we are dealing with
70 + *     @start: First block to remove
71 + *     @end:   One block after the last block to remove (exclusive)
72 + *
73 + *     Free the blocks in the defined range (end is exclusive endpoint of
74 + *     range). This is used by ext4_punch_hole().
75 + */
76 +int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
77 +                         ext4_lblk_t start, ext4_lblk_t end)
78  {
79 -       struct buffer_head *bh = NULL;
80 +       struct ext4_inode_info *ei = EXT4_I(inode);
81 +       __le32 *i_data = ei->i_data;
82         int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
83 -       int ret = 0;
84 -       int i, inc;
85 -       ext4_lblk_t offset;
86 -       __le32 blk;
88 -       inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level);
89 -       for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) {
90 -               if (offset >= count + first)
91 -                       break;
92 -               if (*i_data == 0 || (offset + inc) <= first)
93 -                       continue;
94 -               blk = *i_data;
95 -               if (level > 0) {
96 -                       ext4_lblk_t first2;
97 -                       ext4_lblk_t count2;
98 +       ext4_lblk_t offsets[4], offsets2[4];
99 +       Indirect chain[4], chain2[4];
100 +       Indirect *partial, *partial2;
101 +       ext4_lblk_t max_block;
102 +       __le32 nr = 0, nr2 = 0;
103 +       int n = 0, n2 = 0;
104 +       unsigned blocksize = inode->i_sb->s_blocksize;
106 -                       bh = sb_bread(inode->i_sb, le32_to_cpu(blk));
107 -                       if (!bh) {
108 -                               EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),
109 -                                                      "Read failure");
110 -                               return -EIO;
111 -                       }
112 -                       if (first > offset) {
113 -                               first2 = first - offset;
114 -                               count2 = count;
115 +       max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
116 +                                       >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
117 +       if (end >= max_block)
118 +               end = max_block;
119 +       if ((start >= end) || (start > max_block))
120 +               return 0;
122 +       n = ext4_block_to_path(inode, start, offsets, NULL);
123 +       n2 = ext4_block_to_path(inode, end, offsets2, NULL);
125 +       BUG_ON(n > n2);
127 +       if ((n == 1) && (n == n2)) {
128 +               /* We're punching only within direct block range */
129 +               ext4_free_data(handle, inode, NULL, i_data + offsets[0],
130 +                              i_data + offsets2[0]);
131 +               return 0;
132 +       } else if (n2 > n) {
133 +               /*
134 +                * Start and end are on a different levels so we're going to
135 +                * free partial block at start, and partial block at end of
136 +                * the range. If there are some levels in between then
137 +                * do_indirects label will take care of that.
138 +                */
140 +               if (n == 1) {
141 +                       /*
142 +                        * Start is at the direct block level, free
143 +                        * everything to the end of the level.
144 +                        */
145 +                       ext4_free_data(handle, inode, NULL, i_data + offsets[0],
146 +                                      i_data + EXT4_NDIR_BLOCKS);
147 +                       goto end_range;
148 +               }
151 +               partial = ext4_find_shared(inode, n, offsets, chain, &nr);
152 +               if (nr) {
153 +                       if (partial == chain) {
154 +                               /* Shared branch grows from the inode */
155 +                               ext4_free_branches(handle, inode, NULL,
156 +                                          &nr, &nr+1, (chain+n-1) - partial);
157 +                               *partial->p = 0;
158                         } else {
159 -                               first2 = 0;
160 -                               count2 = count - (offset - first);
161 +                               /* Shared branch grows from an indirect block */
162 +                               BUFFER_TRACE(partial->bh, "get_write_access");
163 +                               ext4_free_branches(handle, inode, partial->bh,
164 +                                       partial->p,
165 +                                       partial->p+1, (chain+n-1) - partial);
166                         }
167 -                       ret = free_hole_blocks(handle, inode, bh,
168 -                                              (__le32 *)bh->b_data, level - 1,
169 -                                              first2, count2,
170 -                                              inode->i_sb->s_blocksize >> 2);
171 -                       if (ret) {
172 -                               brelse(bh);
173 -                               goto err;
174 +               }
176 +               /*
177 +                * Clear the ends of indirect blocks on the shared branch
178 +                * at the start of the range
179 +                */
180 +               while (partial > chain) {
181 +                       ext4_free_branches(handle, inode, partial->bh,
182 +                               partial->p + 1,
183 +                               (__le32 *)partial->bh->b_data+addr_per_block,
184 +                               (chain+n-1) - partial);
185 +                       BUFFER_TRACE(partial->bh, "call brelse");
186 +                       brelse(partial->bh);
187 +                       partial--;
188 +               }
190 +end_range:
191 +               partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
192 +               if (nr2) {
193 +                       if (partial2 == chain2) {
194 +                               /*
195 +                                * Remember, end is exclusive so here we're at
196 +                                * the start of the next level we're not going
197 +                                * to free. Everything was covered by the start
198 +                                * of the range.
199 +                                */
200 +                               return 0;
201 +                       } else {
202 +                               /* Shared branch grows from an indirect block */
203 +                               partial2--;
204                         }
205 +               } else {
206 +                       /*
207 +                        * ext4_find_shared returns Indirect structure which
208 +                        * points to the last element which should not be
209 +                        * removed by truncate. But this is end of the range
210 +                        * in punch_hole so we need to point to the next element
211 +                        */
212 +                       partial2->p++;
213                 }
214 -               if (level == 0 ||
215 -                   (bh && all_zeroes((__le32 *)bh->b_data,
216 -                                     (__le32 *)bh->b_data + addr_per_block))) {
217 -                       ext4_free_data(handle, inode, parent_bh,
218 -                                      i_data, i_data + 1);
220 +               /*
221 +                * Clear the ends of indirect blocks on the shared branch
222 +                * at the end of the range
223 +                */
224 +               while (partial2 > chain2) {
225 +                       ext4_free_branches(handle, inode, partial2->bh,
226 +                                          (__le32 *)partial2->bh->b_data,
227 +                                          partial2->p,
228 +                                          (chain2+n2-1) - partial2);
229 +                       BUFFER_TRACE(partial2->bh, "call brelse");
230 +                       brelse(partial2->bh);
231 +                       partial2--;
232                 }
233 -               brelse(bh);
234 -               bh = NULL;
235 +               goto do_indirects;
236         }
238 -err:
239 -       return ret;
242 -int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
243 -                         ext4_lblk_t first, ext4_lblk_t stop)
245 -       int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
246 -       int level, ret = 0;
247 -       int num = EXT4_NDIR_BLOCKS;
248 -       ext4_lblk_t count, max = EXT4_NDIR_BLOCKS;
249 -       __le32 *i_data = EXT4_I(inode)->i_data;
251 -       count = stop - first;
252 -       for (level = 0; level < 4; level++, max *= addr_per_block) {
253 -               if (first < max) {
254 -                       ret = free_hole_blocks(handle, inode, NULL, i_data,
255 -                                              level, first, count, num);
256 -                       if (ret)
257 -                               goto err;
258 -                       if (count > max - first)
259 -                               count -= max - first;
260 -                       else
261 -                               break;
262 -                       first = 0;
263 -               } else {
264 -                       first -= max;
265 +       /* Punch happened within the same level (n == n2) */
266 +       partial = ext4_find_shared(inode, n, offsets, chain, &nr);
267 +       partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
268 +       /*
269 +        * ext4_find_shared returns Indirect structure which
270 +        * points to the last element which should not be
271 +        * removed by truncate. But this is end of the range
272 +        * in punch_hole so we need to point to the next element
273 +        */
274 +       partial2->p++;
275 +       while ((partial > chain) || (partial2 > chain2)) {
276 +               /* We're at the same block, so we're almost finished */
277 +               if ((partial->bh && partial2->bh) &&
278 +                   (partial->bh->b_blocknr == partial2->bh->b_blocknr)) {
279 +                       if ((partial > chain) && (partial2 > chain2)) {
280 +                               ext4_free_branches(handle, inode, partial->bh,
281 +                                                  partial->p + 1,
282 +                                                  partial2->p,
283 +                                                  (chain+n-1) - partial);
284 +                               BUFFER_TRACE(partial->bh, "call brelse");
285 +                               brelse(partial->bh);
286 +                               BUFFER_TRACE(partial2->bh, "call brelse");
287 +                               brelse(partial2->bh);
288 +                       }
289 +                       return 0;
290                 }
291 -               i_data += num;
292 -               if (level == 0) {
293 -                       num = 1;
294 -                       max = 1;
295 +               /*
296 +                * Clear the ends of indirect blocks on the shared branch
297 +                * at the start of the range
298 +                */
299 +               if (partial > chain) {
300 +                       ext4_free_branches(handle, inode, partial->bh,
301 +                                  partial->p + 1,
302 +                                  (__le32 *)partial->bh->b_data+addr_per_block,
303 +                                  (chain+n-1) - partial);
304 +                       BUFFER_TRACE(partial->bh, "call brelse");
305 +                       brelse(partial->bh);
306 +                       partial--;
307 +               }
308 +               /*
309 +                * Clear the ends of indirect blocks on the shared branch
310 +                * at the end of the range
311 +                */
312 +               if (partial2 > chain2) {
313 +                       ext4_free_branches(handle, inode, partial2->bh,
314 +                                          (__le32 *)partial2->bh->b_data,
315 +                                          partial2->p,
316 +                                          (chain2+n-1) - partial2);
317 +                       BUFFER_TRACE(partial2->bh, "call brelse");
318 +                       brelse(partial2->bh);
319 +                       partial2--;
320                 }
321         }
323 -err:
324 -       return ret;
325 +do_indirects:
326 +       /* Kill the remaining (whole) subtrees */
327 +       switch (offsets[0]) {
328 +       default:
329 +               if (++n >= n2)
330 +                       return 0;
331 +               nr = i_data[EXT4_IND_BLOCK];
332 +               if (nr) {
333 +                       ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
334 +                       i_data[EXT4_IND_BLOCK] = 0;
335 +               }
336 +       case EXT4_IND_BLOCK:
337 +               if (++n >= n2)
338 +                       return 0;
339 +               nr = i_data[EXT4_DIND_BLOCK];
340 +               if (nr) {
341 +                       ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
342 +                       i_data[EXT4_DIND_BLOCK] = 0;
343 +               }
344 +       case EXT4_DIND_BLOCK:
345 +               if (++n >= n2)
346 +                       return 0;
347 +               nr = i_data[EXT4_TIND_BLOCK];
348 +               if (nr) {
349 +                       ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
350 +                       i_data[EXT4_TIND_BLOCK] = 0;
351 +               }
352 +       case EXT4_TIND_BLOCK:
353 +               ;
354 +       }
355 +       return 0;
358 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
359 index 027ee8c..367a60c 100644
360 --- a/fs/ext4/inode.c
361 +++ b/fs/ext4/inode.c
362 @@ -3506,7 +3506,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
363                 ret = ext4_ext_remove_space(inode, first_block,
364                                             stop_block - 1);
365         else
366 -               ret = ext4_free_hole_blocks(handle, inode, first_block,
367 +               ret = ext4_ind_remove_space(handle, inode, first_block,
368                                             stop_block);
370         up_write(&EXT4_I(inode)->i_data_sem);