Add patch SR-ext4-resize-mark-new-group-EXT_BG_INODE_ZEROED.patch
[ext4-patch-queue/an.git] / defrag-03-exchange-extents
blob6379fd0826af16356b9f65f1b91e044712c2fdbf
ext4: online defrag -- Exchange the extents between two inodes

From: Akira Fujita <a-fujita@rs.jp.nec.com>

For each page, exchange the extents between the temporary inode
and the original inode, and then write them.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
---
 fs/ext4/defrag.c |  477 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 476 insertions(+), 1 deletions(-)
14 diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
15 index 729f001..0b90d4d 100644
16 --- a/fs/ext4/defrag.c
17 +++ b/fs/ext4/defrag.c
18 @@ -91,6 +91,361 @@ err:
19  }
21  /**
22 + * ext4_defrag_merge_extents - Merge new extent
23 + *
24 + * @handle:    journal handle
25 + * @org_inode: original inode
26 + * @org_path:  path indicates first extent to be defraged
27 + * @o_start:   first original extent to be defraged
28 + * @o_end:     last original extent to be defraged
29 + * @start_ext: first new extent to be merged
30 + * @new_ext:   middle of new extent to be merged
31 + * @end_ext:   last new extent to be merged
32 + * @replaced:  the number of blocks which will be replaced with new_ext
33 + *
34 + * This function returns 0 if succeed, otherwise returns error value.
35 + */
36 +static int
37 +ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,
38 +               struct ext4_ext_path *org_path,
39 +               struct ext4_extent *o_start, struct ext4_extent *o_end,
40 +               struct ext4_extent *start_ext, struct ext4_extent *new_ext,
41 +               struct ext4_extent *end_ext, ext4_fsblk_t replaced)
43 +       return 0;
/**
 * ext4_defrag_leaf_block - Defragmentation for one leaf extent block
 *
 * @handle:		journal handle
 * @org_inode:		original inode
 * @org_path:		path indicates first extent to be defraged.
 *			NOTE(review): this function re-looks-up (and may
 *			reallocate) the path via ext4_ext_find_extent() below,
 *			but the caller's pointer is not updated -- confirm the
 *			caller does not reuse/free a stale path afterwards.
 * @dext:		destination extent
 * @from:		start offset on the target file
 *
 * Walks the original extents covering [*from, *from + dext->ee_len) and,
 * per leaf, computes the split (start_ext / new_ext / end_ext) before
 * handing off to ext4_defrag_merge_extents().
 *
 * This function returns 0 if succeed, otherwise returns error value.
 */
static int
ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
		struct ext4_ext_path *org_path, struct ext4_extent *dext,
		ext4_lblk_t *from)
{
	struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
	struct ext4_extent new_ext, start_ext, end_ext;
	/* Number of blocks to be replaced in the current merge batch. */
	ext4_fsblk_t replaced = 0;
	ext4_lblk_t new_end, lblock;
	unsigned long depth;
	/* NOTE(review): 'len' and 'new_phys_end' are computed but never read
	 * in this chunk -- presumably leftovers; candidates for removal. */
	unsigned short len;
	ext4_fsblk_t new_phys_end;
	int	ret;

	depth = ext_depth(org_inode);
	start_ext.ee_len = end_ext.ee_len = 0;
	o_start = o_end = oext = org_path[depth].p_ext;
	/* new_ext describes dext's blocks re-homed at logical offset *from. */
	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
	new_ext.ee_len = dext->ee_len;
	len = le16_to_cpu(new_ext.ee_len);
	new_ext.ee_block = cpu_to_le32(*from);
	lblock = le32_to_cpu(oext->ee_block);
	/* Last logical / physical block covered by new_ext (inclusive). */
	new_end = le32_to_cpu(new_ext.ee_block)
		+ le16_to_cpu(new_ext.ee_len) - 1;
	new_phys_end = ext_pblock(&new_ext)
		+ le16_to_cpu(new_ext.ee_len) - 1;

	/*
	 * First original extent
	 * dest  |---------------|
	 * org  |---------------|
	 */
	if (le32_to_cpu(new_ext.ee_block) >
		le32_to_cpu(oext->ee_block) &&
		le32_to_cpu(new_ext.ee_block) <
		le32_to_cpu(oext->ee_block)
		+ le16_to_cpu(oext->ee_len)) {
		/* new_ext starts inside oext: keep oext's head as start_ext. */
		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block)
					- le32_to_cpu(oext->ee_block));
		replaced += le16_to_cpu(oext->ee_len)
					- le16_to_cpu(start_ext.ee_len);
	} else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
		/* We can merge previous extent. */
		prev_ext = oext - 1;
		/* Contiguous both physically and logically with prev_ext? */
		if (((ext_pblock(prev_ext) + le16_to_cpu(prev_ext->ee_len))
				 == ext_pblock(&new_ext))
		 && (le32_to_cpu(prev_ext->ee_block)
			+ le16_to_cpu(prev_ext->ee_len)
				 == le32_to_cpu(new_ext.ee_block))) {
			o_start = prev_ext;
			start_ext.ee_len = cpu_to_le16(
					le16_to_cpu(prev_ext->ee_len)
					+ le16_to_cpu(new_ext.ee_len));
			/* new_ext fully folded into start_ext. */
			new_ext.ee_len = 0;
		}
	}

	for (;;) {
		/* The extent for destination must be found. */
		BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
		lblock += le16_to_cpu(oext->ee_len);

		/*
		 * Middle of original extent
		 * dest |-------------------|
		 * org   |-----------------|
		 */
		if (le32_to_cpu(new_ext.ee_block) <=
			le32_to_cpu(oext->ee_block) &&
			new_end >= le32_to_cpu(oext->ee_block)
			+ le16_to_cpu(oext->ee_len) - 1)
			replaced += le16_to_cpu(oext->ee_len);

		/*
		 * Last original extent
		 * dest |----------------|
		 * org    |---------------|
		 */
		if (new_end >= le32_to_cpu(oext->ee_block) &&
			new_end < le32_to_cpu(oext->ee_block)
				+ le16_to_cpu(oext->ee_len) - 1) {
			/* Keep oext's tail (past new_end) as end_ext. */
			end_ext.ee_len
				= cpu_to_le16(le32_to_cpu(oext->ee_block)
				+ le16_to_cpu(oext->ee_len) - 1 - new_end);
			ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
				+ le16_to_cpu(oext->ee_len)
				- le16_to_cpu(end_ext.ee_len)));
			end_ext.ee_block
				= cpu_to_le32(le32_to_cpu(o_end->ee_block)
				+ le16_to_cpu(oext->ee_len)
				- le16_to_cpu(end_ext.ee_len));
			replaced += le16_to_cpu(oext->ee_len)
				- le16_to_cpu(end_ext.ee_len);
		}

		/*
		 * Detected the block end, reached the number of replaced
		 * blocks to dext->ee_len. Then merge the extent.
		 */
		if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
			new_end <= le32_to_cpu(oext->ee_block)
				+ le16_to_cpu(oext->ee_len) - 1) {
			ret = ext4_defrag_merge_extents(handle, org_inode,
					org_path, o_start, o_end, &start_ext,
					&new_ext, &end_ext, replaced);
			if (ret < 0)
				return ret;

			/* All expected blocks are replaced */
			/* (len is 0 here only when new_ext was folded into
			 * the previous extent above -- le16 is unsigned, so
			 * "<= 0" really means "== 0".) */
			if (le16_to_cpu(new_ext.ee_len) <= 0)
				return 0;

			/* Re-calculate new_ext */
			/* NOTE(review): 'replaced' is ext4_fsblk_t (64-bit)
			 * but is negated into a 16-bit le16_add_cpu here --
			 * presumably always fits in the extent-length range;
			 * confirm it can never exceed 0x7fff. */
			le16_add_cpu(&new_ext.ee_len, -replaced);
			le32_add_cpu(&new_ext.ee_block, replaced);
			ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
					+ replaced);
			replaced = 0;
			start_ext.ee_len = end_ext.ee_len = 0;
			o_start = NULL;

			/* All expected blocks are replaced. */
			if (le16_to_cpu(new_ext.ee_len) <= 0)
				return 0;
		}

		/* Get the next extent for original. */
		if (org_path)
			ext4_ext_drop_refs(org_path);
		/* May reallocate org_path; the caller never sees the new
		 * pointer (see NOTE in the kernel-doc above). */
		org_path = ext4_ext_find_extent(org_inode, lblock, org_path);
		if (IS_ERR(org_path)) {
			ret = PTR_ERR(org_path);
			org_path = NULL;
			return ret;
		}
		depth = ext_depth(org_inode);
		oext = org_path[depth].p_ext;
		/* A hole in the original file: nothing left to exchange. */
		if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
			<= lblock)
			return -ENOENT;

		o_end = oext;
		if (!o_start)
			o_start = oext;
	}
}
/**
 * ext4_defrag_replace_branches - Replace original extents with new extents
 *
 * @handle:		journal handle
 * @org_inode:		original inode
 * @dest_inode:		temporary inode
 * @from:		block offset of org_inode
 * @dest_off:		block offset of dest_inode
 * @count:		block count to be replaced
 *
 * This function returns 0 if succeed, otherwise returns error value.
 * Replace extents for blocks from "from" to "from + count - 1".
 */
static int
ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,
			struct inode *dest_inode, ext4_lblk_t from,
			ext4_lblk_t dest_off, ext4_lblk_t count)
{
	struct ext4_ext_path *org_path = NULL;
	struct ext4_ext_path *dest_path = NULL;
	struct ext4_extent *oext, *dext, *swap_ext;
	/* tmp_ext: clipped view of the destination extent;
	 * tmp_ext2: matching clipped view of the original extent. */
	struct ext4_extent tmp_ext, tmp_ext2;
	ext4_lblk_t diff, org_diff;
	int err = 0;
	int depth;
	int replaced_count = 0;

	/* Get the original extent for the block "from" */
	org_path = ext4_ext_find_extent(org_inode, from, NULL);
	if (IS_ERR(org_path)) {
		err = PTR_ERR(org_path);
		org_path = NULL;
		goto out;
	}

	/* Get the destination extent for the head */
	dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
	if (IS_ERR(dest_path)) {
		err = PTR_ERR(dest_path);
		dest_path = NULL;
		goto out;
	}
	depth = ext_depth(dest_inode);
	dext = dest_path[depth].p_ext;
	/* When dext is too large, pick up the target range. */
	diff = dest_off - le32_to_cpu(dext->ee_block);
	ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
	tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
	tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
	if (count < le16_to_cpu(tmp_ext.ee_len))
		tmp_ext.ee_len = cpu_to_le16(count);
	dext = &tmp_ext;

	depth = ext_depth(org_inode);
	oext = org_path[depth].p_ext;
	org_diff = from - le32_to_cpu(oext->ee_block);
	ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
	tmp_ext2.ee_block = tmp_ext.ee_block;

	/* Adjust extent length when blocksize != pagesize */
	if (le16_to_cpu(tmp_ext.ee_len) <=
		le16_to_cpu(oext->ee_len) - org_diff) {
		tmp_ext2.ee_len = tmp_ext.ee_len;
	} else {
		tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
						- org_diff);
		tmp_ext.ee_len = tmp_ext2.ee_len;
	}
	swap_ext = &tmp_ext2;

	/* Loop for the destination extents */
	while (1) {
		/* The extent for destination must be found. */
		BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));

		/* Loop for the original extent blocks */
		/* NOTE(review): ext4_defrag_leaf_block() may internally
		 * reallocate the path it was handed without updating our
		 * org_path/dest_path pointers -- the drop_refs/kfree at
		 * "out" could then act on a stale pointer. Verify. */
		err = ext4_defrag_leaf_block(handle, org_inode,
						org_path, dext, &from);
		if (err < 0)
			goto out;

		/*
		 * We need the function which fixes extent information for
		 * inserting.
		 * e.g. ext4_defrag_merge_extents()
		 */
		err = ext4_defrag_leaf_block(handle, dest_inode,
					dest_path, swap_ext, &dest_off);
		if (err < 0)
			goto out;

		replaced_count += le16_to_cpu(dext->ee_len);
		dest_off += le16_to_cpu(dext->ee_len);
		from += le16_to_cpu(dext->ee_len);

		/* Already moved the expected blocks */
		if (replaced_count >= count)
			break;

		if (org_path)
			ext4_ext_drop_refs(org_path);
		org_path = ext4_ext_find_extent(org_inode, from, NULL);
		if (IS_ERR(org_path)) {
			err = PTR_ERR(org_path);
			org_path = NULL;
			goto out;
		}
		depth = ext_depth(org_inode);
		oext = org_path[depth].p_ext;
		/* Hole in the original file: stop here, success so far. */
		if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
			<= from) {
			err = 0;
			goto out;
		}

		if (dest_path)
			ext4_ext_drop_refs(dest_path);
		dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
		if (IS_ERR(dest_path)) {
			err = PTR_ERR(dest_path);
			dest_path = NULL;
			goto out;
		}
		depth = ext_depth(dest_inode);
		dext = dest_path[depth].p_ext;
		/* Hole in the temporary file: likewise stop with success. */
		if (le32_to_cpu(dext->ee_block) + le16_to_cpu(dext->ee_len)
			<= dest_off) {
			err = 0;
			goto out;
		}

		/* When dext is too large, pick up the target range. */
		diff = dest_off - le32_to_cpu(dext->ee_block);
		ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
		tmp_ext.ee_block =
			cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
		tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);

		if (count - replaced_count < le16_to_cpu(tmp_ext.ee_len))
			tmp_ext.ee_len = cpu_to_le16(count - replaced_count);

		dext = &tmp_ext;

		org_diff = from - le32_to_cpu(oext->ee_block);
		ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
		tmp_ext2.ee_block = tmp_ext.ee_block;

		/* Adjust extent length when blocksize != pagesize */
		if (le16_to_cpu(tmp_ext.ee_len) <=
			le16_to_cpu(oext->ee_len) - org_diff) {
			tmp_ext2.ee_len = tmp_ext.ee_len;
		} else {
			tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
							- org_diff);
			tmp_ext.ee_len = tmp_ext2.ee_len;
		}
		swap_ext = &tmp_ext2;
	}

out:
	if (org_path) {
		ext4_ext_drop_refs(org_path);
		kfree(org_path);
	}
	if (dest_path) {
		ext4_ext_drop_refs(dest_path);
		kfree(dest_path);
	}

	return err;
}
376 +/**
377   * ext4_defrag_fill_ar - Prepare to multiple block allocate for tmp inode
378   *
379   * @org_inode:         original inode
380 @@ -185,7 +540,127 @@ ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
381                         pgoff_t org_page_offset, ext4_lblk_t dest_blk_offset,
382                         int data_offset_in_page, int block_len_in_page)
384 -       return 0;
385 +       struct inode *org_inode = filp->f_dentry->d_inode;
386 +       struct address_space *mapping = org_inode->i_mapping;
387 +       struct buffer_head *bh;
388 +       struct page *page = NULL;
389 +       const struct address_space_operations *a_ops = mapping->a_ops;
390 +       handle_t *handle;
391 +       ext4_lblk_t org_blk_offset;
392 +       long long offs = org_page_offset << PAGE_CACHE_SHIFT;
393 +       unsigned long blocksize = org_inode->i_sb->s_blocksize;
394 +       unsigned int w_flags = 0;
395 +       unsigned int tmp_data_len;
396 +       unsigned data_len;
397 +       void *fsdata;
398 +       int ret, i, jblocks;
399 +       int blocks_per_page = PAGE_CACHE_SIZE >> org_inode->i_blkbits;
401 +       /*
402 +        * It needs twice the amount of ordinary journal buffers because
403 +        * inode and tmp_inode may change each different metadata blocks.
404 +        */
405 +       jblocks = ext4_writepage_trans_blocks(org_inode) * 2;
406 +       handle = ext4_journal_start(org_inode, jblocks);
407 +       if (IS_ERR(handle)) {
408 +               ret = PTR_ERR(handle);
409 +               return ret;
410 +       }
412 +       if (segment_eq(get_fs(), KERNEL_DS))
413 +               w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
415 +       org_blk_offset = org_page_offset * blocks_per_page +
416 +                                                       data_offset_in_page;
417 +       offs = (long long)org_blk_offset << org_inode->i_blkbits;
419 +       /* Calculate data_len */
420 +       if ((org_blk_offset + block_len_in_page - 1) ==
421 +                       ((org_inode->i_size - 1) >> org_inode->i_blkbits)) {
422 +               /* the case which we replace the last block */
423 +               tmp_data_len = org_inode->i_size & (blocksize - 1);
424 +               /*
425 +                * If data_len equal zero, it shows data_len is multiples of
426 +                * blocksize. So we set appropriate value.
427 +                */
428 +               if (tmp_data_len == 0)
429 +                       tmp_data_len = blocksize;
431 +               data_len = tmp_data_len +
432 +                       ((block_len_in_page - 1) << org_inode->i_blkbits);
433 +       } else {
434 +               data_len = block_len_in_page << org_inode->i_blkbits;
435 +       }
437 +       up_write(&EXT4_I(org_inode)->i_data_sem);
438 +       ret = a_ops->write_begin(filp, mapping, offs, data_len, w_flags, &page,
439 +                                                               &fsdata);
440 +       down_write(&EXT4_I(org_inode)->i_data_sem);
442 +       if (unlikely(ret < 0))
443 +               goto out;
445 +       if (!PageUptodate(page)) {
446 +               up_write(&EXT4_I(org_inode)->i_data_sem);
447 +               mapping->a_ops->readpage(filp, page);
448 +               down_write(&EXT4_I(org_inode)->i_data_sem);
449 +               lock_page(page);
450 +       }
452 +       /*
453 +        * try_to_release_page() doesn't call releasepage in writeback mode.
454 +        * We should care about the order of writing to the same file
455 +        * by multiple defrag processes.
456 +        * It needs to call wait_on_page_writeback() to wait for the
457 +        * writeback of the page.
458 +        */
459 +       if (PageWriteback(page))
460 +               wait_on_page_writeback(page);
462 +       /* Release old bh and drop refs */
463 +       try_to_release_page(page, 0);
464 +       ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode,
465 +                                               org_blk_offset, dest_blk_offset,
466 +                                               block_len_in_page);
467 +       if (ret < 0)
468 +               goto out;
470 +       /* Clear the inode cache not to refer to the old data */
471 +       ext4_ext_invalidate_cache(org_inode);
473 +       if (!page_has_buffers(page))
474 +               create_empty_buffers(page, 1 << org_inode->i_blkbits, 0);
476 +       bh = page_buffers(page);
477 +       for (i = 0; i < data_offset_in_page; i++)
478 +               bh = bh->b_this_page;
480 +       for (i = 0; i < block_len_in_page; i++) {
481 +               up_write(&EXT4_I(org_inode)->i_data_sem);
482 +               ret = ext4_get_block(org_inode, (sector_t)(org_blk_offset + i),
483 +                                                                       bh, 0);
484 +               down_write(&EXT4_I(org_inode)->i_data_sem);
486 +               if (ret < 0)
487 +                       goto out;
489 +               if (bh->b_this_page != NULL)
490 +                       bh = bh->b_this_page;
491 +       }
493 +       ret = a_ops->write_end(filp, mapping, offs, data_len, data_len, page,
494 +                                                                       fsdata);
495 +       page = NULL;
497 +out:
498 +       if (unlikely(page)) {
499 +               if (PageLocked(page))
500 +                       unlock_page(page);
501 +               page_cache_release(page);
502 +       }
503 +       ext4_journal_stop(handle);
505 +       return ret < 0 ? ret : 0;
508  /**