More patch description fixups. Standardize case.
[ext4-patch-queue.git] / delalloc-ext4.patch
blob 3217adcc71491709458487360c0407a8f0d20dd5
1 ext4: Add basic delayed allocation support
3 From: Alex Tomas <alex@clusterfs.com>
5 Two special ->get_block() methods are introduced:
7 * ext4_da_get_block_prep()
8 to be used with ->prepare_write(); defers allocation until flush
9 * ext4_da_get_block_write()
10 to be used with mpage_da_writepages(); allocates blocks and corrects
11 the on-disk size
13 The current implementation works with data=writeback only; you should
14 mount the filesystem with the delalloc,data=writeback options.
16 TODO:
17 * reservation
18 * data=ordered
19 * quota
20 * bmap
22 Signed-off-by: Alex Tomas <alex@clusterfs.com>
23 Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
24 ---
26 fs/ext4/inode.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++
27 fs/ext4/super.c | 6 +-
28 include/linux/ext4_fs.h | 1
29 3 files changed, 166 insertions(+), 1 deletions(-)
32 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
33 index 2947800..6bb788d 100644
34 --- a/fs/ext4/inode.c
35 +++ b/fs/ext4/inode.c
36 @@ -39,6 +39,8 @@
37 #include "xattr.h"
38 #include "acl.h"
40 +static void ext4_invalidatepage(struct page *page, unsigned long offset);
43 * Test whether an inode is a fast symlink.
45 @@ -1352,6 +1354,146 @@ static int ext4_journalled_write_end(struct file *file,
49 + * this is a special callback for ->prepare_write() only
50 + * its intention is to return a mapped block or reserve space
51 + */
52 +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
53 + struct buffer_head *bh_result, int create)
55 + int ret = 0;
57 + BUG_ON(create == 0);
58 + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
60 + /* first, we need to know whether the block is allocated already
61 + * XXX: when the filesystem has a lot of free blocks, we could
62 + * reserve even allocated blocks to save this lookup */
63 + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0);
64 + if (ret >= 0) {
65 + if (buffer_mapped(bh_result)) {
66 + bh_result->b_size = (ret << inode->i_blkbits);
67 + } else {
68 + /* the block isn't allocated yet, let's reserve space */
69 + /* XXX: call reservation here */
70 + /*
71 + * XXX: __block_prepare_write() unmaps passed block,
72 + * is it OK?
73 + */
74 + map_bh(bh_result, inode->i_sb, 0);
75 + set_buffer_new(bh_result);
76 + set_buffer_delay(bh_result);
77 + }
78 + ret = 0;
79 + }
81 + return ret;
85 +static int ext4_da_prepare_write(struct file *file, struct page *page,
86 + unsigned from, unsigned to)
88 + return block_prepare_write(page, from, to, ext4_da_get_block_prep);
91 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
92 + struct buffer_head *bh_result, int create)
94 + int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
95 + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
96 + loff_t disksize = EXT4_I(inode)->i_disksize;
97 + handle_t *handle = NULL;
99 + if (create) {
100 + handle = ext4_journal_start(inode, needed_blocks);
101 + if (IS_ERR(handle)) {
102 + ret = PTR_ERR(handle);
103 + goto out;
107 + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
108 + bh_result, create, 0);
109 + if (ret > 0) {
110 + bh_result->b_size = (ret << inode->i_blkbits);
112 + /*
113 + * Update on-disk size along with block allocation
114 + * we don't use 'extend_disksize' as size may change
115 + * within already allocated block -bzzz
116 + */
117 + disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
118 + if (disksize > i_size_read(inode))
119 + disksize = i_size_read(inode);
120 + if (disksize > EXT4_I(inode)->i_disksize) {
121 + /*
122 + * XXX: replace with spinlock if seen contended -bzzz
123 + */
124 + down_write(&EXT4_I(inode)->i_data_sem);
125 + if (disksize > EXT4_I(inode)->i_disksize)
126 + EXT4_I(inode)->i_disksize = disksize;
127 + up_write(&EXT4_I(inode)->i_data_sem);
129 + if (EXT4_I(inode)->i_disksize == disksize) {
130 + if (handle == NULL)
131 + handle = ext4_journal_start(inode, 1);
132 + if (!IS_ERR(handle))
133 + ext4_mark_inode_dirty(handle, inode);
137 + ret = 0;
140 +out:
141 + if (handle && !IS_ERR(handle))
142 + ext4_journal_stop(handle);
144 + return ret;
147 +static int ext4_da_writepages(struct address_space *mapping,
148 + struct writeback_control *wbc)
150 + return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
153 +static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
155 + struct buffer_head *head, *bh;
156 + unsigned int curr_off = 0;
158 + /*
159 + * Drop reserved blocks
160 + */
161 + BUG_ON(!PageLocked(page));
162 + if (!page_has_buffers(page))
163 + goto out;
165 + head = page_buffers(page);
166 + bh = head;
167 + do {
168 + unsigned int next_off = curr_off + bh->b_size;
170 + /*
171 + * is this block fully invalidated?
172 + */
173 + if (offset <= curr_off && buffer_delay(bh)) {
174 + clear_buffer_delay(bh);
175 + /* XXX: add real stuff here */
177 + curr_off = next_off;
178 + bh = bh->b_this_page;
179 + } while (bh != head);
181 +out:
182 + ext4_invalidatepage(page, offset);
184 + return;
189 * bmap() is special. It gets used by applications such as lilo and by
190 * the swapper to find the on-disk block of a specific piece of data.
192 @@ -1802,10 +1944,28 @@ static const struct address_space_operations ext4_journalled_aops = {
193 .releasepage = ext4_releasepage,
196 +static const struct address_space_operations ext4_da_aops = {
197 + .readpage = ext4_readpage,
198 + .readpages = ext4_readpages,
199 + .writepage = ext4_writeback_writepage,
200 + .writepages = ext4_da_writepages,
201 + .sync_page = block_sync_page,
202 + .prepare_write = ext4_da_prepare_write,
203 + .commit_write = generic_commit_write,
204 + .bmap = ext4_bmap,
205 + .invalidatepage = ext4_da_invalidatepage,
206 + .releasepage = ext4_releasepage,
207 + .direct_IO = ext4_direct_IO,
208 + .migratepage = buffer_migrate_page,
211 void ext4_set_aops(struct inode *inode)
213 if (ext4_should_order_data(inode))
214 inode->i_mapping->a_ops = &ext4_ordered_aops;
215 + else if (ext4_should_writeback_data(inode) &&
216 + test_opt(inode->i_sb, DELALLOC))
217 + inode->i_mapping->a_ops = &ext4_da_aops;
218 else if (ext4_should_writeback_data(inode))
219 inode->i_mapping->a_ops = &ext4_writeback_aops;
220 else
221 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
222 index 092775f..6822f58 100644
223 --- a/fs/ext4/super.c
224 +++ b/fs/ext4/super.c
225 @@ -886,7 +886,7 @@ enum {
226 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
227 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
228 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
229 - Opt_mballoc, Opt_nomballoc, Opt_stripe,
230 + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc,
233 static match_table_t tokens = {
234 @@ -944,6 +944,7 @@ static match_table_t tokens = {
235 {Opt_mballoc, "mballoc"},
236 {Opt_nomballoc, "nomballoc"},
237 {Opt_stripe, "stripe=%u"},
238 + {Opt_delalloc, "delalloc"},
239 {Opt_err, NULL},
240 {Opt_resize, "resize"},
242 @@ -1306,6 +1307,9 @@ clear_qf_name:
243 return 0;
244 sbi->s_stripe = option;
245 break;
246 + case Opt_delalloc:
247 + set_opt(sbi->s_mount_opt, DELALLOC);
248 + break;
249 default:
250 printk (KERN_ERR
251 "EXT4-fs: Unrecognized mount option \"%s\" "
252 diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
253 index 1852313..adc16a9 100644
254 --- a/include/linux/ext4_fs.h
255 +++ b/include/linux/ext4_fs.h
256 @@ -521,6 +521,7 @@ do { \
257 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
258 #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
259 #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
260 +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
261 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
262 #ifndef _LINUX_EXT2_FS_H
263 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt