Further updates of Documentation/filesystem/ext4.txt
[ext4-patch-queue.git] / ext4-online-defrag-relocate-file-data.patch
blobc014094a9b425248687b24b16b1d33e444bdb6e5
1 ext4: online defrag-- Read and write file data with memory page
3 From: Akira Fujita <a-fujita@rs.jp.nec.com>
5 Read the file data from the old blocks to the page and
6 write the file data on the page into the new blocks.
8 Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
9 Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
10 ---
11 fs/ext4/defrag.c | 464 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
12 fs/ext4/ext4.h | 2
13 fs/ext4/inode.c | 3
14 3 files changed, 466 insertions(+), 3 deletions(-)
16 Index: linux-2.6.26-rc6/fs/ext4/defrag.c
17 ===================================================================
18 --- linux-2.6.26-rc6.orig/fs/ext4/defrag.c 2008-06-17 10:43:42.000000000 -0700
19 +++ linux-2.6.26-rc6/fs/ext4/defrag.c 2008-06-17 10:43:42.000000000 -0700
20 @@ -134,6 +134,368 @@ int ext4_defrag_ioctl(struct inode *inod
23 /**
24 + * ext4_defrag_merge_extents - Merge new extent
25 + *
26 + * @handle: journal handle
27 + * @org_inode: original inode
28 + * @org_path: path indicating the first extent to be defragmented
29 + * @o_start: first original extent to be defragmented
30 + * @o_end: last original extent to be defragmented
31 + * @start_ext: first new extent to be merged
32 + * @new_ext: middle of new extent to be merged
33 + * @end_ext: last new extent to be merged
34 + * @replaced: the number of blocks which will be replaced with new_ext
35 + *
36 + * This function returns 0 on success, otherwise it returns an error value.
37 + */
38 +static int
39 +ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,
40 + struct ext4_ext_path *org_path,
41 + struct ext4_extent *o_start, struct ext4_extent *o_end,
42 + struct ext4_extent *start_ext, struct ext4_extent *new_ext,
43 + struct ext4_extent *end_ext, ext4_fsblk_t replaced)
45 + return 0;
48 +/**
49 + * ext4_defrag_leaf_block - Defragmentation for one leaf extent block
50 + *
51 + * @handle: journal handle
52 + * @org_inode: original inode
53 + * @org_path: path indicating the first extent to be defragmented
54 + * @dext: destination extent
55 + * @from: start offset on the target file
56 + *
57 + * This function returns 0 on success, otherwise it returns an error value.
58 + */
59 +static int
60 +ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
61 + struct ext4_ext_path *org_path, struct ext4_extent *dext,
62 + ext4_lblk_t *from)
64 + struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
65 + struct ext4_extent new_ext, start_ext, end_ext;
66 + ext4_fsblk_t replaced = 0;
67 + ext4_lblk_t new_end, lblock;
68 + unsigned long depth;
69 + unsigned short len;
70 + ext4_fsblk_t new_phys_end;
71 + int ret;
73 + depth = ext_depth(org_inode);
74 + start_ext.ee_len = end_ext.ee_len = 0;
75 + o_start = o_end = oext = org_path[depth].p_ext;
76 + ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
77 + new_ext.ee_len = dext->ee_len;
78 + len = le16_to_cpu(new_ext.ee_len);
79 + new_ext.ee_block = cpu_to_le32(*from);
80 + lblock = le32_to_cpu(oext->ee_block);
81 + new_end = le32_to_cpu(new_ext.ee_block)
82 + + le16_to_cpu(new_ext.ee_len) - 1;
83 + new_phys_end = ext_pblock(&new_ext)
84 + + le16_to_cpu(new_ext.ee_len) - 1;
86 + /*
87 + * First original extent
88 + * dest |---------------|
89 + * org |---------------|
90 + */
91 + if (le32_to_cpu(new_ext.ee_block) >
92 + le32_to_cpu(oext->ee_block) &&
93 + le32_to_cpu(new_ext.ee_block) <
94 + le32_to_cpu(oext->ee_block)
95 + + le16_to_cpu(oext->ee_len)) {
96 + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block)
97 + - le32_to_cpu(oext->ee_block));
98 + replaced += le16_to_cpu(oext->ee_len)
99 + - le16_to_cpu(start_ext.ee_len);
100 + } else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
101 + /* We can merge previous extent. */
102 + prev_ext = oext - 1;
103 + if (((ext_pblock(prev_ext) + le16_to_cpu(prev_ext->ee_len))
104 + == ext_pblock(&new_ext))
105 + && (le32_to_cpu(prev_ext->ee_block)
106 + + le16_to_cpu(prev_ext->ee_len)
107 + == le32_to_cpu(new_ext.ee_block))) {
108 + o_start = prev_ext;
109 + start_ext.ee_len = cpu_to_le16(
110 + le16_to_cpu(prev_ext->ee_len)
111 + + le16_to_cpu(new_ext.ee_len));
112 + new_ext.ee_len = 0;
116 + for (;;) {
117 + /* The extent for destination must be found. */
118 + BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
119 + lblock += le16_to_cpu(oext->ee_len);
121 + /*
122 + * Middle of original extent
123 + * dest |-------------------|
124 + * org |-----------------|
125 + */
126 + if (le32_to_cpu(new_ext.ee_block) <=
127 + le32_to_cpu(oext->ee_block) &&
128 + new_end >= le32_to_cpu(oext->ee_block)
129 + + le16_to_cpu(oext->ee_len) - 1)
130 + replaced += le16_to_cpu(oext->ee_len);
132 + /*
133 + * Last original extent
134 + * dest |----------------|
135 + * org |---------------|
136 + */
137 + if (new_end >= le32_to_cpu(oext->ee_block) &&
138 + new_end < le32_to_cpu(oext->ee_block)
139 + + le16_to_cpu(oext->ee_len) - 1) {
140 + end_ext.ee_len
141 + = cpu_to_le16(le32_to_cpu(oext->ee_block)
142 + + le16_to_cpu(oext->ee_len) - 1 - new_end);
143 + ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
144 + + le16_to_cpu(oext->ee_len)
145 + - le16_to_cpu(end_ext.ee_len)));
146 + end_ext.ee_block
147 + = cpu_to_le32(le32_to_cpu(o_end->ee_block)
148 + + le16_to_cpu(oext->ee_len)
149 + - le16_to_cpu(end_ext.ee_len));
150 + replaced += le16_to_cpu(oext->ee_len)
151 + - le16_to_cpu(end_ext.ee_len);
154 + /*
155 + * Reached the last extent in the leaf block, or the new extent ends
156 + * within this original extent. Then merge the extents.
157 + */
158 + if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
159 + new_end <= le32_to_cpu(oext->ee_block)
160 + + le16_to_cpu(oext->ee_len) - 1) {
161 + ret = ext4_defrag_merge_extents(handle, org_inode,
162 + org_path, o_start, o_end, &start_ext,
163 + &new_ext, &end_ext, replaced);
164 + if (ret < 0)
165 + return ret;
167 + /* All expected blocks are replaced */
168 + if (le16_to_cpu(new_ext.ee_len) <= 0)
169 + return 0;
171 + /* Re-calculate new_ext */
172 + le16_add_cpu(&new_ext.ee_len, -replaced);
173 + le32_add_cpu(&new_ext.ee_block, replaced);
174 + ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
175 + + replaced);
176 + replaced = 0;
177 + start_ext.ee_len = end_ext.ee_len = 0;
178 + o_start = NULL;
180 + /* All expected blocks are replaced. */
181 + if (le16_to_cpu(new_ext.ee_len) <= 0)
182 + return 0;
185 + /* Get the next extent for original. */
186 + if (org_path)
187 + ext4_ext_drop_refs(org_path);
188 + org_path = ext4_ext_find_extent(org_inode, lblock, org_path);
189 + if (IS_ERR(org_path)) {
190 + ret = PTR_ERR(org_path);
191 + org_path = NULL;
192 + return ret;
194 + depth = ext_depth(org_inode);
195 + oext = org_path[depth].p_ext;
196 + if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
197 + <= lblock)
198 + return -ENOENT;
200 + o_end = oext;
201 + if (!o_start)
202 + o_start = oext;
206 +/**
207 + * ext4_defrag_replace_branches - Replace original extents with new extents
209 + * @handle: journal handle
210 + * @org_inode: original inode
211 + * @dest_inode: temporary inode
212 + * @from_page: page offset of org_inode
213 + * @dest_from_page: page offset of dest_inode
214 + * @count_page: page count to be replaced
216 + * This function returns 0 on success, otherwise it returns an error value.
217 + * Replace extents for blocks from "from" to "from + count - 1".
218 + */
219 +static int
220 +ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,
221 + struct inode *dest_inode, pgoff_t from_page,
222 + pgoff_t dest_from_page, pgoff_t count_page)
224 + struct ext4_ext_path *org_path = NULL;
225 + struct ext4_ext_path *dest_path = NULL;
226 + struct ext4_extent *oext, *dext, *swap_ext;
227 + struct ext4_extent tmp_ext, tmp_ext2;
228 + ext4_lblk_t from, count, dest_off, diff, org_diff;
229 + int err = 0;
230 + int depth;
231 + int replaced_count = 0;
233 + from = (ext4_lblk_t)from_page <<
234 + (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
235 + count = (ext4_lblk_t)count_page <<
236 + (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
237 + dest_off = (ext4_lblk_t)dest_from_page <<
238 + (PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
240 + /* Get the original extent for the block "from" */
241 + org_path = ext4_ext_find_extent(org_inode, from, NULL);
242 + if (IS_ERR(org_path)) {
243 + err = PTR_ERR(org_path);
244 + org_path = NULL;
245 + goto out;
248 + /* Get the destination extent for the head */
249 + dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
250 + if (IS_ERR(dest_path)) {
251 + err = PTR_ERR(dest_path);
252 + dest_path = NULL;
253 + goto out;
255 + depth = ext_depth(dest_inode);
256 + dext = dest_path[depth].p_ext;
257 + /* When dext is too large, pick up the target range. */
258 + diff = dest_off - le32_to_cpu(dext->ee_block);
259 + ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
260 + tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
261 + tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
262 + if (count < le16_to_cpu(tmp_ext.ee_len))
263 + tmp_ext.ee_len = cpu_to_le16(count);
264 + dext = &tmp_ext;
266 + depth = ext_depth(org_inode);
267 + oext = org_path[depth].p_ext;
268 + org_diff = from - le32_to_cpu(oext->ee_block);
269 + ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
270 + tmp_ext2.ee_block = tmp_ext.ee_block;
272 + /* Adjust extent length when blocksize != pagesize */
273 + if (le16_to_cpu(tmp_ext.ee_len) <=
274 + le16_to_cpu(oext->ee_len) - org_diff) {
275 + tmp_ext2.ee_len = tmp_ext.ee_len;
276 + } else {
277 + tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
278 + - org_diff);
279 + tmp_ext.ee_len = tmp_ext2.ee_len;
281 + swap_ext = &tmp_ext2;
283 + /* Loop for the destination extents */
284 + while (1) {
285 + /* The extent for destination must be found. */
286 + BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
288 + /* Loop for the original extent blocks */
289 + err = ext4_defrag_leaf_block(handle, org_inode,
290 + org_path, dext, &from);
291 + if (err < 0)
292 + goto out;
294 + /*
295 + * We need a function that fixes up the extent information for
296 + * insertion,
297 + * e.g. ext4_defrag_merge_extents()
298 + */
299 + err = ext4_defrag_leaf_block(handle, dest_inode,
300 + dest_path, swap_ext, &dest_off);
301 + if (err < 0)
302 + goto out;
304 + replaced_count += le16_to_cpu(dext->ee_len);
305 + dest_off += le16_to_cpu(dext->ee_len);
306 + from += le16_to_cpu(dext->ee_len);
308 + /* Already moved the expected blocks */
309 + if (replaced_count >= count)
310 + break;
312 + if (org_path)
313 + ext4_ext_drop_refs(org_path);
314 + org_path = ext4_ext_find_extent(org_inode, from, NULL);
315 + if (IS_ERR(org_path)) {
316 + err = PTR_ERR(org_path);
317 + org_path = NULL;
318 + goto out;
320 + depth = ext_depth(org_inode);
321 + oext = org_path[depth].p_ext;
322 + if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
323 + <= from) {
324 + err = 0;
325 + goto out;
328 + if (dest_path)
329 + ext4_ext_drop_refs(dest_path);
330 + dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
331 + if (IS_ERR(dest_path)) {
332 + err = PTR_ERR(dest_path);
333 + dest_path = NULL;
334 + goto out;
336 + depth = ext_depth(dest_inode);
337 + dext = dest_path[depth].p_ext;
338 + if (le32_to_cpu(dext->ee_block) + le16_to_cpu(dext->ee_len)
339 + <= dest_off) {
340 + err = 0;
341 + goto out;
344 + /* When dext is too large, pick up the target range. */
345 + diff = dest_off - le32_to_cpu(dext->ee_block);
346 + ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
347 + tmp_ext.ee_block =
348 + cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
349 + tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
351 + if (count - replaced_count < le16_to_cpu(tmp_ext.ee_len))
352 + tmp_ext.ee_len = cpu_to_le16(count - replaced_count);
354 + dext = &tmp_ext;
356 + org_diff = from - le32_to_cpu(oext->ee_block);
357 + ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
358 + tmp_ext2.ee_block = tmp_ext.ee_block;
360 + /* Adjust extent length when blocksize != pagesize */
361 + if (le16_to_cpu(tmp_ext.ee_len) <=
362 + le16_to_cpu(oext->ee_len) - org_diff) {
363 + tmp_ext2.ee_len = tmp_ext.ee_len;
364 + } else {
365 + tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
366 + - org_diff);
367 + tmp_ext.ee_len = tmp_ext2.ee_len;
369 + swap_ext = &tmp_ext2;
372 +out:
373 + if (org_path) {
374 + ext4_ext_drop_refs(org_path);
375 + kfree(org_path);
377 + if (dest_path) {
378 + ext4_ext_drop_refs(dest_path);
379 + kfree(dest_path);
382 + return err;
385 +/**
386 * ext4_defrag_fill_ar - Prepare to multiple block allocate for tmp inode
388 * @org_inode: original inode
389 @@ -226,7 +588,107 @@ static int
390 ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
391 pgoff_t org_offset, pgoff_t dest_offset)
393 - return 0;
394 + struct inode *org_inode = filp->f_dentry->d_inode;
395 + struct address_space *mapping = org_inode->i_mapping;
396 + struct buffer_head *bh;
397 + struct page *page;
398 + const struct address_space_operations *a_ops = mapping->a_ops;
399 + handle_t *handle;
400 + pgoff_t offset_in_page = PAGE_SIZE;
401 + int ret, i, jblocks, blocks_per_page;
402 + int blocksize = org_inode->i_sb->s_blocksize;
403 + long long offs = org_offset << PAGE_CACHE_SHIFT;
404 + unsigned long blk_off = 0;
405 + unsigned int w_flags = 0;
406 + void *fsdata;
408 + /*
409 + * Twice the ordinary amount of journal buffers is needed because the
410 + * original inode and tmp_inode may each modify different metadata blocks.
411 + */
412 + jblocks = ext4_writepage_trans_blocks(org_inode) * 2;
413 + handle = ext4_journal_start(org_inode, jblocks);
414 + if (IS_ERR(handle)) {
415 + ret = PTR_ERR(handle);
416 + return ret;
419 + if (segment_eq(get_fs(), KERNEL_DS))
420 + w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
422 + if (org_offset == ((org_inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
423 + offset_in_page = (org_inode->i_size & (PAGE_CACHE_SIZE - 1));
424 + /*
425 + * Set offset_in_page to PAGE_CACHE_SIZE so that it is not 0
426 + * when org_offset is the last page and i_size is a
427 + * multiple of PAGE_CACHE_SIZE.
428 + */
429 + if (offset_in_page == 0)
430 + offset_in_page = PAGE_CACHE_SIZE;
433 + up_write(&EXT4_I(org_inode)->i_data_sem);
434 + ret = a_ops->write_begin(filp, mapping, offs,
435 + offset_in_page, w_flags, &page, &fsdata);
436 + down_write(&EXT4_I(org_inode)->i_data_sem);
438 + if (unlikely(ret < 0))
439 + goto out;
441 + if (!PageUptodate(page)) {
442 + mapping->a_ops->readpage(filp, page);
443 + lock_page(page);
446 + /*
447 + * try_to_release_page() doesn't call releasepage in writeback mode.
448 + * We should care about the order of writing to the same file
449 + * by multiple defrag processes.
450 + * It needs to call wait_on_page_writeback() to wait for the
451 + * writeback of the page.
452 + */
453 + if (PageWriteback(page))
454 + wait_on_page_writeback(page);
456 + /* Release old bh and drop refs */
457 + try_to_release_page(page, 0);
458 + ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode,
459 + org_offset, dest_offset, 1);
461 + if (ret < 0)
462 + goto out;
464 + /* Clear the inode cache not to refer to the old data */
465 + ext4_ext_invalidate_cache(org_inode);
467 + if (!page_has_buffers(page))
468 + create_empty_buffers(page, 1 << org_inode->i_blkbits, 0);
470 + blocks_per_page = PAGE_SIZE / blocksize;
471 + blk_off = org_offset * blocks_per_page;
473 + bh = page_buffers(page);
474 + for (i = 0; i < blocks_per_page; i++) {
475 + up_write(&EXT4_I(org_inode)->i_data_sem);
476 + ret = ext4_get_block(org_inode, blk_off++, bh, 0);
477 + down_write(&EXT4_I(org_inode)->i_data_sem);
479 + if (ret < 0)
480 + goto out;
482 + if (bh->b_this_page != NULL)
483 + bh = bh->b_this_page;
486 + ret = a_ops->write_end(filp, mapping, offs, offset_in_page,
487 + offset_in_page, page, fsdata);
489 + if (unlikely(ret < 0))
490 + goto out;
491 +out:
492 + ext4_journal_stop(handle);
494 + return (ret < 0 ? ret : 0);
498 Index: linux-2.6.26-rc6/fs/ext4/ext4.h
499 ===================================================================
500 --- linux-2.6.26-rc6.orig/fs/ext4/ext4.h 2008-06-17 10:43:42.000000000 -0700
501 +++ linux-2.6.26-rc6/fs/ext4/ext4.h 2008-06-17 10:43:42.000000000 -0700
502 @@ -1083,6 +1083,8 @@ extern int ext4_writepage_trans_blocks(s
503 extern int ext4_block_truncate_page(handle_t *handle,
504 struct address_space *mapping, loff_t from);
505 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
506 +extern int ext4_get_block(struct inode *inode, sector_t iblock,
507 + struct buffer_head *bh_result, int create);
509 /* ioctl.c */
510 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
511 Index: linux-2.6.26-rc6/fs/ext4/inode.c
512 ===================================================================
513 --- linux-2.6.26-rc6.orig/fs/ext4/inode.c 2008-06-17 10:43:39.000000000 -0700
514 +++ linux-2.6.26-rc6/fs/ext4/inode.c 2008-06-17 10:43:42.000000000 -0700
515 @@ -1064,8 +1064,7 @@ int ext4_get_blocks_wrap(handle_t *handl
516 up_write((&EXT4_I(inode)->i_data_sem));
517 return retval;
520 -static int ext4_get_block(struct inode *inode, sector_t iblock,
521 +int ext4_get_block(struct inode *inode, sector_t iblock,
522 struct buffer_head *bh_result, int create)
524 handle_t *handle = ext4_journal_current_handle();