1 ext4: online defrag -- Exchange the extents between two inodes
3 From: Akira Fujita <a-fujita@rs.jp.nec.com>
5 For each page, exchange the extents between the temporary inode
6 and the original inode, and then write them.
8 Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
9 Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
11 fs/ext4/defrag.c | 477 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
12 1 files changed, 476 insertions(+), 1 deletions(-)
14 diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
15 index 729f001..0b90d4d 100644
16 --- a/fs/ext4/defrag.c
17 +++ b/fs/ext4/defrag.c
18 @@ -91,6 +91,361 @@ err:
22 + * ext4_defrag_merge_extents - Merge new extent
24 + * @handle: journal handle
25 + * @org_inode: original inode
26 + * @org_path: path indicating the first extent to be defragged
27 + * @o_start: first original extent to be defragged
28 + * @o_end: last original extent to be defragged
29 + * @start_ext: first new extent to be merged
30 + * @new_ext: middle of new extent to be merged
31 + * @end_ext: last new extent to be merged
32 + * @replaced: the number of blocks which will be replaced with new_ext
34 + * This function returns 0 on success, otherwise it returns an error value.
37 +ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,
38 + struct ext4_ext_path *org_path,
39 + struct ext4_extent *o_start, struct ext4_extent *o_end,
40 + struct ext4_extent *start_ext, struct ext4_extent *new_ext,
41 + struct ext4_extent *end_ext, ext4_fsblk_t replaced)
47 + * ext4_defrag_leaf_block - Defragmentation for one leaf extent block
49 + * @handle: journal handle
50 + * @org_inode: original inode
51 + * @org_path: path indicating the first extent to be defragged
52 + * @dext: destination extent
53 + * @from: start offset on the target file
55 + * This function returns 0 on success, otherwise it returns an error value.
58 +ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
59 + struct ext4_ext_path *org_path, struct ext4_extent *dext,
62 + struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
63 + struct ext4_extent new_ext, start_ext, end_ext;
64 + ext4_fsblk_t replaced = 0;
65 + ext4_lblk_t new_end, lblock;
66 + unsigned long depth;
68 + ext4_fsblk_t new_phys_end;
71 + depth = ext_depth(org_inode);
72 + start_ext.ee_len = end_ext.ee_len = 0;
73 + o_start = o_end = oext = org_path[depth].p_ext;
74 + ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
75 + new_ext.ee_len = dext->ee_len;
76 + len = le16_to_cpu(new_ext.ee_len);
77 + new_ext.ee_block = cpu_to_le32(*from);
78 + lblock = le32_to_cpu(oext->ee_block);
79 + new_end = le32_to_cpu(new_ext.ee_block)
80 + + le16_to_cpu(new_ext.ee_len) - 1;
81 + new_phys_end = ext_pblock(&new_ext)
82 + + le16_to_cpu(new_ext.ee_len) - 1;
85 + * First original extent
86 + * dest |---------------|
87 + * org |---------------|
89 + if (le32_to_cpu(new_ext.ee_block) >
90 + le32_to_cpu(oext->ee_block) &&
91 + le32_to_cpu(new_ext.ee_block) <
92 + le32_to_cpu(oext->ee_block)
93 + + le16_to_cpu(oext->ee_len)) {
94 + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block)
95 + - le32_to_cpu(oext->ee_block));
96 + replaced += le16_to_cpu(oext->ee_len)
97 + - le16_to_cpu(start_ext.ee_len);
98 + } else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
99 + /* We can merge previous extent. */
100 + prev_ext = oext - 1;
101 + if (((ext_pblock(prev_ext) + le16_to_cpu(prev_ext->ee_len))
102 + == ext_pblock(&new_ext))
103 + && (le32_to_cpu(prev_ext->ee_block)
104 + + le16_to_cpu(prev_ext->ee_len)
105 + == le32_to_cpu(new_ext.ee_block))) {
106 + o_start = prev_ext;
107 + start_ext.ee_len = cpu_to_le16(
108 + le16_to_cpu(prev_ext->ee_len)
109 + + le16_to_cpu(new_ext.ee_len));
110 + new_ext.ee_len = 0;
115 + /* The extent for destination must be found. */
116 + BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
117 + lblock += le16_to_cpu(oext->ee_len);
120 + * Middle of original extent
121 + * dest |-------------------|
122 + * org |-----------------|
124 + if (le32_to_cpu(new_ext.ee_block) <=
125 + le32_to_cpu(oext->ee_block) &&
126 + new_end >= le32_to_cpu(oext->ee_block)
127 + + le16_to_cpu(oext->ee_len) - 1)
128 + replaced += le16_to_cpu(oext->ee_len);
131 + * Last original extent
132 + * dest |----------------|
133 + * org |---------------|
135 + if (new_end >= le32_to_cpu(oext->ee_block) &&
136 + new_end < le32_to_cpu(oext->ee_block)
137 + + le16_to_cpu(oext->ee_len) - 1) {
139 + = cpu_to_le16(le32_to_cpu(oext->ee_block)
140 + + le16_to_cpu(oext->ee_len) - 1 - new_end);
141 + ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
142 + + le16_to_cpu(oext->ee_len)
143 + - le16_to_cpu(end_ext.ee_len)));
145 + = cpu_to_le32(le32_to_cpu(o_end->ee_block)
146 + + le16_to_cpu(oext->ee_len)
147 + - le16_to_cpu(end_ext.ee_len));
148 + replaced += le16_to_cpu(oext->ee_len)
149 + - le16_to_cpu(end_ext.ee_len);
153 + * We have reached either the end of this leaf block or the requested
154 + * number of replaced blocks (dext->ee_len); merge the extents now.
156 + if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
157 + new_end <= le32_to_cpu(oext->ee_block)
158 + + le16_to_cpu(oext->ee_len) - 1) {
159 + ret = ext4_defrag_merge_extents(handle, org_inode,
160 + org_path, o_start, o_end, &start_ext,
161 + &new_ext, &end_ext, replaced);
165 + /* All expected blocks are replaced */
166 + if (le16_to_cpu(new_ext.ee_len) <= 0)
169 + /* Re-calculate new_ext */
170 + le16_add_cpu(&new_ext.ee_len, -replaced);
171 + le32_add_cpu(&new_ext.ee_block, replaced);
172 + ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
175 + start_ext.ee_len = end_ext.ee_len = 0;
178 + /* All expected blocks are replaced. */
179 + if (le16_to_cpu(new_ext.ee_len) <= 0)
183 + /* Get the next extent for original. */
185 + ext4_ext_drop_refs(org_path);
186 + org_path = ext4_ext_find_extent(org_inode, lblock, org_path);
187 + if (IS_ERR(org_path)) {
188 + ret = PTR_ERR(org_path);
192 + depth = ext_depth(org_inode);
193 + oext = org_path[depth].p_ext;
194 + if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
205 + * ext4_defrag_replace_branches - Replace original extents with new extents
207 + * @handle: journal handle
208 + * @org_inode: original inode
209 + * @dest_inode: temporary inode
210 + * @from: block offset of org_inode
211 + * @dest_off: block offset of dest_inode
212 + * @count: block count to be replaced
214 + * This function returns 0 on success, otherwise it returns an error value.
215 + * Replace extents for blocks from "from" to "from + count - 1".
218 +ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,
219 + struct inode *dest_inode, ext4_lblk_t from,
220 + ext4_lblk_t dest_off, ext4_lblk_t count)
222 + struct ext4_ext_path *org_path = NULL;
223 + struct ext4_ext_path *dest_path = NULL;
224 + struct ext4_extent *oext, *dext, *swap_ext;
225 + struct ext4_extent tmp_ext, tmp_ext2;
226 + ext4_lblk_t diff, org_diff;
229 + int replaced_count = 0;
231 + /* Get the original extent for the block "from" */
232 + org_path = ext4_ext_find_extent(org_inode, from, NULL);
233 + if (IS_ERR(org_path)) {
234 + err = PTR_ERR(org_path);
239 + /* Get the destination extent for the head */
240 + dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
241 + if (IS_ERR(dest_path)) {
242 + err = PTR_ERR(dest_path);
246 + depth = ext_depth(dest_inode);
247 + dext = dest_path[depth].p_ext;
248 + /* When dext is too large, pick up the target range. */
249 + diff = dest_off - le32_to_cpu(dext->ee_block);
250 + ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
251 + tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
252 + tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
253 + if (count < le16_to_cpu(tmp_ext.ee_len))
254 + tmp_ext.ee_len = cpu_to_le16(count);
257 + depth = ext_depth(org_inode);
258 + oext = org_path[depth].p_ext;
259 + org_diff = from - le32_to_cpu(oext->ee_block);
260 + ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
261 + tmp_ext2.ee_block = tmp_ext.ee_block;
263 + /* Adjust extent length when blocksize != pagesize */
264 + if (le16_to_cpu(tmp_ext.ee_len) <=
265 + le16_to_cpu(oext->ee_len) - org_diff) {
266 + tmp_ext2.ee_len = tmp_ext.ee_len;
268 + tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
270 + tmp_ext.ee_len = tmp_ext2.ee_len;
272 + swap_ext = &tmp_ext2;
274 + /* Loop for the destination extents */
276 + /* The extent for destination must be found. */
277 + BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
279 + /* Loop for the original extent blocks */
280 + err = ext4_defrag_leaf_block(handle, org_inode,
281 + org_path, dext, &from);
286 + * We need the function which fixes extent information for
288 + * e.g. ext4_defrag_merge_extents()
290 + err = ext4_defrag_leaf_block(handle, dest_inode,
291 + dest_path, swap_ext, &dest_off);
295 + replaced_count += le16_to_cpu(dext->ee_len);
296 + dest_off += le16_to_cpu(dext->ee_len);
297 + from += le16_to_cpu(dext->ee_len);
299 + /* Already moved the expected blocks */
300 + if (replaced_count >= count)
304 + ext4_ext_drop_refs(org_path);
305 + org_path = ext4_ext_find_extent(org_inode, from, NULL);
306 + if (IS_ERR(org_path)) {
307 + err = PTR_ERR(org_path);
311 + depth = ext_depth(org_inode);
312 + oext = org_path[depth].p_ext;
313 + if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
320 + ext4_ext_drop_refs(dest_path);
321 + dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
322 + if (IS_ERR(dest_path)) {
323 + err = PTR_ERR(dest_path);
327 + depth = ext_depth(dest_inode);
328 + dext = dest_path[depth].p_ext;
329 + if (le32_to_cpu(dext->ee_block) + le16_to_cpu(dext->ee_len)
335 + /* When dext is too large, pick up the target range. */
336 + diff = dest_off - le32_to_cpu(dext->ee_block);
337 + ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
339 + cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
340 + tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
342 + if (count - replaced_count < le16_to_cpu(tmp_ext.ee_len))
343 + tmp_ext.ee_len = cpu_to_le16(count - replaced_count);
347 + org_diff = from - le32_to_cpu(oext->ee_block);
348 + ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
349 + tmp_ext2.ee_block = tmp_ext.ee_block;
351 + /* Adjust extent length when blocksize != pagesize */
352 + if (le16_to_cpu(tmp_ext.ee_len) <=
353 + le16_to_cpu(oext->ee_len) - org_diff) {
354 + tmp_ext2.ee_len = tmp_ext.ee_len;
356 + tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
358 + tmp_ext.ee_len = tmp_ext2.ee_len;
360 + swap_ext = &tmp_ext2;
365 + ext4_ext_drop_refs(org_path);
369 + ext4_ext_drop_refs(dest_path);
377 * ext4_defrag_fill_ar - Prepare to multiple block allocate for tmp inode
379 * @org_inode: original inode
380 @@ -185,7 +540,127 @@ ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
381 pgoff_t org_page_offset, ext4_lblk_t dest_blk_offset,
382 int data_offset_in_page, int block_len_in_page)
385 + struct inode *org_inode = filp->f_dentry->d_inode;
386 + struct address_space *mapping = org_inode->i_mapping;
387 + struct buffer_head *bh;
388 + struct page *page = NULL;
389 + const struct address_space_operations *a_ops = mapping->a_ops;
391 + ext4_lblk_t org_blk_offset;
392 + long long offs = org_page_offset << PAGE_CACHE_SHIFT;
393 + unsigned long blocksize = org_inode->i_sb->s_blocksize;
394 + unsigned int w_flags = 0;
395 + unsigned int tmp_data_len;
398 + int ret, i, jblocks;
399 + int blocks_per_page = PAGE_CACHE_SIZE >> org_inode->i_blkbits;
402 + * Twice the usual number of journal buffers is needed because
403 + * the inode and tmp_inode may each modify different metadata blocks.
405 + jblocks = ext4_writepage_trans_blocks(org_inode) * 2;
406 + handle = ext4_journal_start(org_inode, jblocks);
407 + if (IS_ERR(handle)) {
408 + ret = PTR_ERR(handle);
412 + if (segment_eq(get_fs(), KERNEL_DS))
413 + w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
415 + org_blk_offset = org_page_offset * blocks_per_page +
416 + data_offset_in_page;
417 + offs = (long long)org_blk_offset << org_inode->i_blkbits;
419 + /* Calculate data_len */
420 + if ((org_blk_offset + block_len_in_page - 1) ==
421 + ((org_inode->i_size - 1) >> org_inode->i_blkbits)) {
422 + /* the case which we replace the last block */
423 + tmp_data_len = org_inode->i_size & (blocksize - 1);
425 + * If tmp_data_len is zero, the file size is a multiple of the
426 + * blocksize, so use a full block instead.
428 + if (tmp_data_len == 0)
429 + tmp_data_len = blocksize;
431 + data_len = tmp_data_len +
432 + ((block_len_in_page - 1) << org_inode->i_blkbits);
434 + data_len = block_len_in_page << org_inode->i_blkbits;
437 + up_write(&EXT4_I(org_inode)->i_data_sem);
438 + ret = a_ops->write_begin(filp, mapping, offs, data_len, w_flags, &page,
440 + down_write(&EXT4_I(org_inode)->i_data_sem);
442 + if (unlikely(ret < 0))
445 + if (!PageUptodate(page)) {
446 + up_write(&EXT4_I(org_inode)->i_data_sem);
447 + mapping->a_ops->readpage(filp, page);
448 + down_write(&EXT4_I(org_inode)->i_data_sem);
453 + * try_to_release_page() doesn't call releasepage in writeback mode.
454 + * We should care about the order of writing to the same file
455 + * by multiple defrag processes.
456 + * It needs to call wait_on_page_writeback() to wait for the
457 + * writeback of the page.
459 + if (PageWriteback(page))
460 + wait_on_page_writeback(page);
462 + /* Release old bh and drop refs */
463 + try_to_release_page(page, 0);
464 + ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode,
465 + org_blk_offset, dest_blk_offset,
466 + block_len_in_page);
470 + /* Clear the inode cache not to refer to the old data */
471 + ext4_ext_invalidate_cache(org_inode);
473 + if (!page_has_buffers(page))
474 + create_empty_buffers(page, 1 << org_inode->i_blkbits, 0);
476 + bh = page_buffers(page);
477 + for (i = 0; i < data_offset_in_page; i++)
478 + bh = bh->b_this_page;
480 + for (i = 0; i < block_len_in_page; i++) {
481 + up_write(&EXT4_I(org_inode)->i_data_sem);
482 + ret = ext4_get_block(org_inode, (sector_t)(org_blk_offset + i),
484 + down_write(&EXT4_I(org_inode)->i_data_sem);
489 + if (bh->b_this_page != NULL)
490 + bh = bh->b_this_page;
493 + ret = a_ops->write_end(filp, mapping, offs, data_len, data_len, page,
498 + if (unlikely(page)) {
499 + if (PageLocked(page))
501 + page_cache_release(page);
503 + ext4_journal_stop(handle);
505 + return ret < 0 ? ret : 0;