Don't actually panic if a journal checksum failure is detected on
[ext4-patch-queue.git] / delalloc-ext4.patch
blobb8ce0465530ee2c5b8207a23de574b4e5a4c185f
1 ext4: Add basic delayed allocation support
3 From: Alex Tomas <alex@clusterfs.com>
5 Two special ->get_block() methods are introduced:
7 * ext4_da_get_block_prep()
8 to be used with ->write_begin(), defers allocation till flush
9 * ext4_da_get_block_write()
10 to be used with mpage_da_writepages(), allocates blocks and corrects
11 the on-disk size
13 The current implementation works with data=writeback only; you should
14 mount the filesystem with the delalloc,data=writeback options.
16 TODO:
17 * reservation
18 * data=ordered
19 * quota
20 * bmap
22 Signed-off-by: Alex Tomas <alex@clusterfs.com>
23 Signed-off-by: Mingming Cao <cmm@us.ibm.com>
24 Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
25 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
26 ---
28 fs/ext4/ext4.h | 1
29 fs/ext4/inode.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
30 fs/ext4/super.c | 6 +
31 3 files changed, 182 insertions(+), 1 deletion(-)
34 Index: linux-2.6.26-rc4/fs/ext4/inode.c
35 ===================================================================
36 --- linux-2.6.26-rc4.orig/fs/ext4/inode.c 2008-05-29 10:52:37.000000000 -0700
37 +++ linux-2.6.26-rc4/fs/ext4/inode.c 2008-05-29 10:52:39.000000000 -0700
38 @@ -39,6 +39,8 @@
39 #include "xattr.h"
40 #include "acl.h"
42 +static void ext4_invalidatepage(struct page *page, unsigned long offset);
45 * Test whether an inode is a fast symlink.
47 @@ -1435,6 +1437,162 @@ static int ext4_journalled_write_end(str
51 + * this is a special callback for ->prepare_write() only
52 + * its intention is to return a mapped block or reserve space
53 + */
54 +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
55 + struct buffer_head *bh_result, int create)
57 + int ret = 0;
59 + BUG_ON(create == 0);
60 + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
62 + /* first, we need to know whether the block is allocated already
63 + * XXX: when the filesystem has a lot of free blocks, we could
64 + * reserve even allocated blocks to save this lookup */
65 + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0);
66 + if (ret >= 0) {
67 + if (buffer_mapped(bh_result)) {
68 + bh_result->b_size = (ret << inode->i_blkbits);
69 + } else {
70 + /* the block isn't allocated yet, let's reserve space */
71 + /* XXX: call reservation here */
72 + /*
73 + * XXX: __block_prepare_write() unmaps passed block,
74 + * is it OK?
75 + */
76 + map_bh(bh_result, inode->i_sb, 0);
77 + set_buffer_new(bh_result);
78 + set_buffer_delay(bh_result);
79 + }
80 + ret = 0;
81 + }
83 + return ret;
86 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
87 + struct buffer_head *bh_result, int create)
89 + int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
90 + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
91 + loff_t disksize = EXT4_I(inode)->i_disksize;
92 + handle_t *handle = NULL;
94 + if (create) {
95 + handle = ext4_journal_start(inode, needed_blocks);
96 + if (IS_ERR(handle)) {
97 + ret = PTR_ERR(handle);
98 + goto out;
99 + }
102 + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
103 + bh_result, create, 0);
104 + if (ret > 0) {
105 + bh_result->b_size = (ret << inode->i_blkbits);
107 + /*
108 + * Update on-disk size along with block allocation
109 + * we don't use 'extend_disksize' as size may change
110 + * within already allocated block -bzzz
111 + */
112 + disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
113 + if (disksize > i_size_read(inode))
114 + disksize = i_size_read(inode);
115 + if (disksize > EXT4_I(inode)->i_disksize) {
116 + /*
117 + * XXX: replace with spinlock if seen contended -bzzz
118 + */
119 + down_write(&EXT4_I(inode)->i_data_sem);
120 + if (disksize > EXT4_I(inode)->i_disksize)
121 + EXT4_I(inode)->i_disksize = disksize;
122 + up_write(&EXT4_I(inode)->i_data_sem);
124 + if (EXT4_I(inode)->i_disksize == disksize) {
125 + if (handle == NULL)
126 + handle = ext4_journal_start(inode, 1);
127 + if (!IS_ERR(handle))
128 + ext4_mark_inode_dirty(handle, inode);
132 + ret = 0;
135 +out:
136 + if (handle && !IS_ERR(handle))
137 + ext4_journal_stop(handle);
139 + return ret;
142 +static int ext4_da_writepages(struct address_space *mapping,
143 + struct writeback_control *wbc)
145 + return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
148 +static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
149 + loff_t pos, unsigned len, unsigned flags,
150 + struct page **pagep, void **fsdata)
152 + int ret;
153 + struct page *page;
154 + pgoff_t index;
155 + unsigned from, to;
157 + index = pos >> PAGE_CACHE_SHIFT;
158 + from = pos & (PAGE_CACHE_SIZE - 1);
159 + to = from + len;
161 + page = __grab_cache_page(mapping, index);
162 + if (!page)
163 + return -ENOMEM;
164 + *pagep = page;
166 + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
167 + ext4_da_get_block_prep);
168 + return ret;
171 +static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
173 + struct buffer_head *head, *bh;
174 + unsigned int curr_off = 0;
176 + /*
177 + * Drop reserved blocks
178 + */
179 + BUG_ON(!PageLocked(page));
180 + if (!page_has_buffers(page))
181 + goto out;
183 + head = page_buffers(page);
184 + bh = head;
185 + do {
186 + unsigned int next_off = curr_off + bh->b_size;
188 + /*
189 + * is this block fully invalidated?
190 + */
191 + if (offset <= curr_off && buffer_delay(bh)) {
192 + clear_buffer_delay(bh);
193 + /* XXX: add real stuff here */
195 + curr_off = next_off;
196 + bh = bh->b_this_page;
197 + } while (bh != head);
199 +out:
200 + ext4_invalidatepage(page, offset);
202 + return;
207 * bmap() is special. It gets used by applications such as lilo and by
208 * the swapper to find the on-disk block of a specific piece of data.
210 @@ -1893,10 +2051,28 @@ static const struct address_space_operat
211 .releasepage = ext4_releasepage,
214 +static const struct address_space_operations ext4_da_aops = {
215 + .readpage = ext4_readpage,
216 + .readpages = ext4_readpages,
217 + .writepage = ext4_writeback_writepage,
218 + .writepages = ext4_da_writepages,
219 + .sync_page = block_sync_page,
220 + .write_begin = ext4_da_write_begin,
221 + .write_end = generic_write_end,
222 + .bmap = ext4_bmap,
223 + .invalidatepage = ext4_da_invalidatepage,
224 + .releasepage = ext4_releasepage,
225 + .direct_IO = ext4_direct_IO,
226 + .migratepage = buffer_migrate_page,
229 void ext4_set_aops(struct inode *inode)
231 if (ext4_should_order_data(inode))
232 inode->i_mapping->a_ops = &ext4_ordered_aops;
233 + else if (ext4_should_writeback_data(inode) &&
234 + test_opt(inode->i_sb, DELALLOC))
235 + inode->i_mapping->a_ops = &ext4_da_aops;
236 else if (ext4_should_writeback_data(inode))
237 inode->i_mapping->a_ops = &ext4_writeback_aops;
238 else
239 Index: linux-2.6.26-rc4/fs/ext4/super.c
240 ===================================================================
241 --- linux-2.6.26-rc4.orig/fs/ext4/super.c 2008-05-29 10:52:38.000000000 -0700
242 +++ linux-2.6.26-rc4/fs/ext4/super.c 2008-05-29 10:52:39.000000000 -0700
243 @@ -887,7 +887,7 @@ enum {
244 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
245 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
246 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
247 - Opt_mballoc, Opt_nomballoc, Opt_stripe,
248 + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc,
251 static match_table_t tokens = {
252 @@ -946,6 +946,7 @@ static match_table_t tokens = {
253 {Opt_nomballoc, "nomballoc"},
254 {Opt_stripe, "stripe=%u"},
255 {Opt_resize, "resize"},
256 + {Opt_delalloc, "delalloc"},
257 {Opt_err, NULL},
260 @@ -1324,6 +1325,9 @@ set_qf_format:
261 return 0;
262 sbi->s_stripe = option;
263 break;
264 + case Opt_delalloc:
265 + set_opt(sbi->s_mount_opt, DELALLOC);
266 + break;
267 default:
268 printk (KERN_ERR
269 "EXT4-fs: Unrecognized mount option \"%s\" "
270 Index: linux-2.6.26-rc4/fs/ext4/ext4.h
271 ===================================================================
272 --- linux-2.6.26-rc4.orig/fs/ext4/ext4.h 2008-05-29 10:52:38.000000000 -0700
273 +++ linux-2.6.26-rc4/fs/ext4/ext4.h 2008-05-29 10:52:39.000000000 -0700
274 @@ -536,6 +536,7 @@ do { \
275 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
276 #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
277 #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
278 +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
279 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
280 #ifndef _LINUX_EXT2_FS_H
281 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt