1 ext4: online defrag-- Main function of defrag and ioctl implementation
3 From: Akira Fujita <a-fujita@rs.jp.nec.com>
5 Create the temporary inode and do defrag per
6 defrag_size (default 64MB).
8 Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
9 Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
12 fs/ext4/defrag.c | 448 +++++++++++++++++++++++++++++++++++++++++++++++++
14 fs/ext4/ext4_extents.h | 2
17 6 files changed, 473 insertions(+), 2 deletions(-)
19 Index: linux-2.6.26-rc9/fs/ext4/Makefile
20 ===================================================================
21 --- linux-2.6.26-rc9.orig/fs/ext4/Makefile 2008-07-11 16:04:25.000000000 -0700
22 +++ linux-2.6.26-rc9/fs/ext4/Makefile 2008-07-11 16:05:18.000000000 -0700
23 @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
25 ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
26 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
27 - ext4_jbd2.o migrate.o mballoc.o
28 + ext4_jbd2.o migrate.o mballoc.o defrag.o
30 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
31 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
32 Index: linux-2.6.26-rc9/fs/ext4/defrag.c
33 ===================================================================
34 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
35 +++ linux-2.6.26-rc9/fs/ext4/defrag.c 2008-07-11 16:05:18.000000000 -0700
38 + * Copyright (c) 2008, NEC Software Tohoku, Ltd.
39 + * Written by Takashi Sato <t-sato@yk.jp.nec.com>
40 + * Akira Fujita <a-fujita@rs.jp.nec.com>
42 + * This program is free software; you can redistribute it and/or modify it
43 + * under the terms of version 2.1 of the GNU Lesser General Public License
44 + * as published by the Free Software Foundation.
46 + * This program is distributed in the hope that it will be useful,
47 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
48 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
49 + * GNU General Public License for more details.
52 +/* Online defragmentation for EXT4 */
54 +#include <linux/quotaops.h>
55 +#include "ext4_jbd2.h"
56 +#include "ext4_extents.h"
60 + * ext4_defrag_next_extent - Search for the next extent and set it to "extent"
62 + * @inode: inode which is searched
63 + * @path: this will obtain data for the next extent
64 + * @extent: pointer to the next extent we have just gotten
66 + * This function returns 0 or 1 (last entry) on success, otherwise
70 +ext4_defrag_next_extent(struct inode *inode, struct ext4_ext_path *path,
71 + struct ext4_extent **extent)
76 +int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
81 + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
82 + printk(KERN_ERR "ext4 defrag: ino[%lu] is not extents "
83 + "based file\n", inode->i_ino);
87 + if (cmd == EXT4_IOC_DEFRAG) {
88 + struct ext4_ext_defrag_data defrag;
89 + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
91 + if (!capable(CAP_DAC_OVERRIDE)) {
92 + if ((inode->i_mode & S_IRUSR) != S_IRUSR)
94 + if (current->fsuid != inode->i_uid)
98 + if (copy_from_user(&defrag,
99 + (struct ext4_ext_defrag_data __user *)arg,
103 + /* Check goal offset if goal offset was given from userspace */
104 + if (defrag.goal != -1 &&
105 + ext4_blocks_count(es) <= defrag.goal) {
106 + printk(KERN_ERR "ext4 defrag: Invalid goal offset"
107 + " %llu, you can set goal offset up to %llu\n",
108 + defrag.goal, ext4_blocks_count(es) - 1);
112 + err = ext4_defrag(filp, defrag.start_offset,
113 + defrag.defrag_size);
120 + * ext4_defrag_partial - Defrag a file per page
122 + * @tmp_inode: temporary inode
123 + * @filp: pointer to file
124 + * @org_offset: page index on original file
125 + * @dest_offset: page index on temporary file
128 + * This function returns 0 on success, otherwise returns an error value.
131 +ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
132 + pgoff_t org_offset, pgoff_t dest_offset)
138 + * ext4_defrag_new_extent_tree - Get contiguous blocks and build an extent tree
140 + * @org_inode: original inode
141 + * @tmp_inode: temporary inode
142 + * @org_path: indicating the original inode's extent
143 + * @tar_start: starting offset to allocate in blocks
144 + * @tar_blocks: the number of blocks to allocate
145 + * @iblock: file related offset
148 + * This function returns the value as below:
151 + * negative value (error case)
154 +ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
155 + struct ext4_ext_path *org_path, ext4_lblk_t tar_start,
156 + ext4_lblk_t tar_blocks, ext4_lblk_t iblock)
162 + * ext4_defrag_check - Check whether a defrag can be done in this environment
164 + * @org_inode: original inode
165 + * @defrag_size: size of defrag in blocks
167 + * This function returns 0 on success, otherwise returns an error value.
170 +ext4_defrag_check(struct inode *org_inode, ext4_lblk_t defrag_size)
173 + /* ext4 online defrag supports only 4KB block size */
174 + if (org_inode->i_sb->s_blocksize != DEFRAG_BLOCK_SIZE) {
175 + printk(KERN_ERR "ext4 defrag: ext4 online defrag supports "
176 + "only 4KB block size for the moment.\n");
177 + return -EOPNOTSUPP;
180 + /* ext4 online defrag needs mballoc mount option. */
181 + if (!test_opt(org_inode->i_sb, MBALLOC)) {
182 + printk(KERN_ERR "ext4 defrag: multiblock allocation "
184 + return -EOPNOTSUPP;
191 + * ext4_defrag_init_tmp_inode - Create a temporary inode
193 + * @org_inode: original inode
195 + * This function returns a pointer to the struct inode on success,
196 + * otherwise an error pointer that callers test with IS_ERR().
198 +static struct inode *
199 +ext4_defrag_init_tmp_inode(struct inode *org_inode)
202 + struct inode *tmp_inode;
204 + handle = ext4_journal_start(org_inode,
205 + EXT4_DATA_TRANS_BLOCKS(org_inode->i_sb) +
206 + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 4 +
207 + 2 * EXT4_QUOTA_INIT_BLOCKS(org_inode->i_sb));
208 + if (IS_ERR(handle))
209 + /* Return error code */
210 + return (struct inode *)handle;
212 + tmp_inode = ext4_new_inode(handle,
213 + org_inode->i_sb->s_root->d_inode, S_IFREG);
214 + if (IS_ERR(tmp_inode))
217 + i_size_write(tmp_inode, i_size_read(org_inode));
218 + tmp_inode->i_nlink = 0;
219 + ext4_ext_tree_init(handle, tmp_inode);
220 + ext4_orphan_add(handle, tmp_inode);
223 + ext4_journal_stop(handle);
229 + * ext4_defrag - Defrag the specified range of a file
231 + * If no option is specified, ext4_defrag() proceeds in the following order.
232 + * 1.ext4_defrag() calculates the block number where defrag terminates
233 + * from the start block number (block_start) and the size of the data
234 + * to defragment (defrag_size), both specified as arguments.
235 + * If block_start points to a hole, the extent's start offset pointed by
236 + * ext_cur(current extent), holecheck_path, org_path are set after
238 + * 2.Continue step 3 to step 5, until the holecheck_path points to last_extent
239 + * or the ext_cur exceeds the block_end which is last logical block number.
240 + * 3.To get the length of a contiguous area, call ext4_defrag_next_extent()
241 + * with ext_cur (initial value is holecheck_path) repeatedly, until a
242 + * non-contiguous extent is found, the start logical block number exceeds
243 + * block_end, or the extent is the last extent.
244 + * 4.After determining the length of continuous block,
245 + * allocates continuous blocks to a temporary inode
246 + * by ext4_defrag_new_extent_tree().
247 + * 5.Exchange the original inode data with temporary inode data
248 + * from page_offset to seq_end_page by page unit.
249 + * The start page index of data are specified as arguments:
250 + * the original inode is page_offset, the temporary inode is dest_offset.
251 + * 6.Update holecheck_path and org_path to point to the next extent to be
252 + * processed, and release the temporary inode holding the original
253 + * fragmented data. Then, return to step 2.
254 + * 7.Release holecheck_path, org_path and the temporary inode,
255 + * and return defrag_size, which is the size of the defragmented data.
256 + * The defrag command uses defrag_size to calculate the file offset
257 + * where the next defrag pass starts.
258 + * (Since the defrag command invokes ext4_defrag_ioctl() in 64MB units,
259 + * a file bigger than 64MB takes many calls.)
261 + * @filp: pointer to file
262 + * @block_start: starting offset to defrag in blocks
263 + * @defrag_size: size of defrag in blocks
265 + * This function returns the number of blocks on success, otherwise
266 + * returns an error value.
269 +ext4_defrag(struct file *filp, ext4_lblk_t block_start,
270 + ext4_lblk_t defrag_size)
272 + struct inode *org_inode = filp->f_dentry->d_inode, *tmp_inode = NULL;
273 + struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL;
274 + struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
275 + ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
276 + pgoff_t page_offset, seq_end_page, dest_offset;
277 + int ret, depth, seq_extents, last_extent = 0;
279 + /* Check whether defrag can be done in this filesystem environment */
280 + ret = ext4_defrag_check(org_inode, defrag_size);
284 + file_end = (org_inode->i_size - 1) >> org_inode->i_blkbits;
285 + block_end = block_start + defrag_size - 1;
286 + if (file_end < block_end)
287 + defrag_size -= block_end - file_end;
289 + mutex_lock(&org_inode->i_mutex);
290 + down_write(&EXT4_I(org_inode)->i_data_sem);
292 + org_path = ext4_ext_find_extent(org_inode, block_start, NULL);
293 + if (IS_ERR(org_path)) {
294 + ret = PTR_ERR(org_path);
299 + /* Get path structure to check the hole */
300 + holecheck_path = ext4_ext_find_extent(org_inode, block_start, NULL);
301 + if (IS_ERR(holecheck_path)) {
302 + ret = PTR_ERR(holecheck_path);
303 + holecheck_path = NULL;
307 + depth = ext_depth(org_inode);
308 + ext_cur = holecheck_path[depth].p_ext;
309 + if (ext_cur == NULL)
313 + * Get proper extent whose ee_block is beyond block_start
314 + * if block_start was within the hole.
316 + if (le32_to_cpu(ext_cur->ee_block) +
317 + le16_to_cpu(ext_cur->ee_len) - 1 < block_start) {
318 + last_extent = ext4_defrag_next_extent(org_inode,
319 + holecheck_path, &ext_cur);
320 + if (last_extent < 0) {
324 + last_extent = ext4_defrag_next_extent(org_inode, org_path,
326 + if (last_extent < 0) {
332 + seq_start = le32_to_cpu(ext_cur->ee_block);
334 + /* No blocks within the specified range. */
335 + if (le32_to_cpu(ext_cur->ee_block) > block_end) {
336 + printk(KERN_INFO "ext4 defrag: The specified range of file"
337 + " may be the hole\n");
341 + /* Adjust start blocks */
342 + add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
343 + le16_to_cpu(ext_cur->ee_len), block_end + 1) -
344 + max(le32_to_cpu(ext_cur->ee_block), block_start);
346 + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
347 + seq_blocks += add_blocks;
349 + /* Create a temporary inode to be exchanged data block */
350 + tmp_inode = ext4_defrag_init_tmp_inode(org_inode);
351 + if (IS_ERR(tmp_inode)) {
352 + ret = PTR_ERR(tmp_inode);
357 + /* Adjust tail blocks */
358 + if (seq_start + seq_blocks - 1 > block_end)
359 + seq_blocks = block_end - seq_start + 1;
361 + ext_prev = ext_cur;
362 + last_extent = ext4_defrag_next_extent(org_inode,
363 + holecheck_path, &ext_cur);
364 + if (last_extent < 0) {
370 + add_blocks = le16_to_cpu(ext_cur->ee_len);
373 + * Extend the length of contiguous block (seq_blocks)
374 + * if extents are contiguous.
376 + if (le32_to_cpu(ext_prev->ee_block) +
377 + le16_to_cpu(ext_prev->ee_len) ==
378 + le32_to_cpu(ext_cur->ee_block) &&
379 + block_end >= le32_to_cpu(ext_cur->ee_block) &&
388 + /* Found an isolated block */
389 + if (seq_extents == 1) {
390 + seq_start = le32_to_cpu(ext_cur->ee_block);
394 + ret = ext4_defrag_new_extent_tree(org_inode, tmp_inode,
395 + org_path, seq_start, seq_blocks,
400 + } else if (ret == 1) {
402 + seq_start = le32_to_cpu(ext_cur->ee_block);
406 + page_offset = seq_start >>
407 + (PAGE_CACHE_SHIFT - org_inode->i_blkbits);
409 + seq_end_page = (seq_start + seq_blocks - 1) >>
410 + (PAGE_CACHE_SHIFT - org_inode->i_blkbits);
411 + seq_start = le32_to_cpu(ext_cur->ee_block);
414 + * Discard all preallocations.
415 + * This is provisional solution.
416 + * When true ext4_mb_return_to_preallocation() is
417 + * implemented, this will be removed.
419 + ext4_mb_discard_inode_preallocations(org_inode);
421 + while (page_offset <= seq_end_page) {
422 + /* Swap original branches with new branches */
423 + ret = ext4_defrag_partial(tmp_inode, filp,
424 + page_offset, dest_offset);
432 + /* Decrease buffer counter */
433 + if (holecheck_path)
434 + ext4_ext_drop_refs(holecheck_path);
435 + holecheck_path = ext4_ext_find_extent(org_inode,
436 + seq_start, holecheck_path);
437 + if (IS_ERR(holecheck_path)) {
438 + ret = PTR_ERR(holecheck_path);
439 + holecheck_path = NULL;
442 + depth = holecheck_path->p_depth;
445 + /* Decrease buffer counter */
447 + ext4_ext_drop_refs(org_path);
448 + org_path = ext4_ext_find_extent(org_inode, seq_start, org_path);
449 + if (IS_ERR(org_path)) {
450 + ret = PTR_ERR(org_path);
455 + ext_cur = holecheck_path[depth].p_ext;
456 + add_blocks = le16_to_cpu(ext_cur->ee_len);
469 + ext4_ext_drop_refs(org_path);
472 + if (holecheck_path) {
473 + ext4_ext_drop_refs(holecheck_path);
474 + kfree(holecheck_path);
477 + up_write(&EXT4_I(org_inode)->i_data_sem);
478 + mutex_unlock(&org_inode->i_mutex);
483 + return (ret ? ret : defrag_size);
485 Index: linux-2.6.26-rc9/fs/ext4/ext4.h
486 ===================================================================
487 --- linux-2.6.26-rc9.orig/fs/ext4/ext4.h 2008-07-11 16:05:15.000000000 -0700
488 +++ linux-2.6.26-rc9/fs/ext4/ext4.h 2008-07-11 16:05:18.000000000 -0700
489 @@ -301,6 +301,7 @@ struct ext4_new_group_data {
490 #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
491 #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
492 #define EXT4_IOC_MIGRATE _IO('f', 7)
493 +#define EXT4_IOC_DEFRAG _IOW('f', 10, struct ext4_ext_defrag_data)
496 * ioctl commands in 32 bit emulation
497 @@ -318,6 +319,18 @@ struct ext4_new_group_data {
498 #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
499 #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
503 + * ext4 online defrag supports only 4KB block size.
505 +#define DEFRAG_BLOCK_SIZE 4096
507 +struct ext4_ext_defrag_data {
508 + ext4_lblk_t start_offset; /* start offset to defrag in blocks */
509 + ext4_lblk_t defrag_size; /* size of defrag in blocks */
510 + ext4_fsblk_t goal; /* block offset for allocation */
516 @@ -1124,6 +1137,11 @@ extern void ext4_inode_bitmap_set(struct
517 struct ext4_group_desc *bg, ext4_fsblk_t blk);
518 extern void ext4_inode_table_set(struct super_block *sb,
519 struct ext4_group_desc *bg, ext4_fsblk_t blk);
521 +extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start,
522 + ext4_lblk_t defrag_size);
523 +extern int ext4_defrag_ioctl(struct inode *, struct file *, unsigned int,
526 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
528 Index: linux-2.6.26-rc9/fs/ext4/ext4_extents.h
529 ===================================================================
530 --- linux-2.6.26-rc9.orig/fs/ext4/ext4_extents.h 2008-07-11 16:05:13.000000000 -0700
531 +++ linux-2.6.26-rc9/fs/ext4/ext4_extents.h 2008-07-11 16:05:18.000000000 -0700
532 @@ -229,5 +229,7 @@ extern int ext4_ext_search_left(struct i
533 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
534 ext4_lblk_t *, ext4_fsblk_t *);
535 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
536 +extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
537 +extern void ext4_ext_drop_refs(struct ext4_ext_path *path);
538 #endif /* _EXT4_EXTENTS */
540 Index: linux-2.6.26-rc9/fs/ext4/extents.c
541 ===================================================================
542 --- linux-2.6.26-rc9.orig/fs/ext4/extents.c 2008-07-11 16:05:17.000000000 -0700
543 +++ linux-2.6.26-rc9/fs/ext4/extents.c 2008-07-11 16:05:18.000000000 -0700
546 * combine low and high parts of physical block number into ext4_fsblk_t
548 -static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
549 +ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
553 Index: linux-2.6.26-rc9/fs/ext4/ioctl.c
554 ===================================================================
555 --- linux-2.6.26-rc9.orig/fs/ext4/ioctl.c 2008-07-11 16:04:26.000000000 -0700
556 +++ linux-2.6.26-rc9/fs/ext4/ioctl.c 2008-07-11 16:05:18.000000000 -0700
557 @@ -241,6 +241,9 @@ setversion_out:
561 + case EXT4_IOC_DEFRAG: {
562 + return ext4_defrag_ioctl(inode, filp, cmd, arg);
564 case EXT4_IOC_GROUP_ADD: {
565 struct ext4_new_group_data input;
566 struct super_block *sb = inode->i_sb;