Rebase to 2.6.24-rc8
[ext4-patch-queue.git] / ext4-delayed-allocation.patch
blob1a447dacb2ff6dc93b4ed0968bd476c5bba45ada
1 ext4: [RFC] delayed allocation for ext4
3 From: Alex Tomas <alex@clusterfs.com>
5 delayed allocation itself, enabled by "delalloc" mount option.
6 extents support is also required. currently it works only
7 with blocksize=pagesize.
9 Shaggy fixed a lot of endian errors reported by sparse
11 Signed-off-by: Alex Tomas <alex@clusterfs.com>
12 ---
14 fs/ext4/Makefile | 2
15 fs/ext4/extents.c | 30 +
16 fs/ext4/file.c | 4
17 fs/ext4/inode.c | 34 +
18 fs/ext4/super.c | 14
19 fs/ext4/writeback.c | 1185 ++++++++++++++++++++++++++++++++++++++++
20 include/linux/ext4_fs.h | 17
21 include/linux/ext4_fs_extents.h | 1
22 include/linux/ext4_fs_i.h | 4
23 include/linux/ext4_fs_sb.h | 11
24 10 files changed, 1297 insertions(+), 5 deletions(-)
26 Index: linux-2.6.23-rc2/fs/ext4/Makefile
27 ===================================================================
28 --- linux-2.6.23-rc2.orig/fs/ext4/Makefile 2007-08-06 22:18:09.000000000 -0700
29 +++ linux-2.6.23-rc2/fs/ext4/Makefile 2007-08-06 22:19:05.000000000 -0700
30 @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
32 ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
33 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
34 - ext4_jbd2.o
35 + ext4_jbd2.o writeback.o
37 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
38 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
39 Index: linux-2.6.23-rc2/fs/ext4/extents.c
40 ===================================================================
41 --- linux-2.6.23-rc2.orig/fs/ext4/extents.c 2007-08-06 22:18:09.000000000 -0700
42 +++ linux-2.6.23-rc2/fs/ext4/extents.c 2007-08-06 22:19:05.000000000 -0700
43 @@ -2508,6 +2508,36 @@ int ext4_ext_writepage_trans_blocks(stru
44 return needed;
47 +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
49 + int lcap, icap, rcap, leafs, idxs, num;
51 + rcap = ext4_ext_space_root(inode);
52 + if (blocks <= rcap) {
53 + /* all extents fit to the root */
54 + return 0;
55 + }
57 + rcap = ext4_ext_space_root_idx(inode);
58 + lcap = ext4_ext_space_block(inode);
59 + icap = ext4_ext_space_block_idx(inode);
61 + num = leafs = (blocks + lcap - 1) / lcap;
62 + if (leafs <= rcap) {
63 + /* all pointers to leafs fit to the root */
64 + return leafs;
65 + }
67 + /* ok. we need separate index block(s) to link all leaf blocks */
68 + idxs = (leafs + icap - 1) / icap;
69 + do {
70 + num += idxs;
71 + idxs = (idxs + icap - 1) / icap;
72 + } while (idxs > rcap);
74 + return num;
78 * preallocate space for a file. This implements ext4's fallocate inode
79 * operation, which gets called from sys_fallocate system call.
80 Index: linux-2.6.23-rc2/fs/ext4/file.c
81 ===================================================================
82 --- linux-2.6.23-rc2.orig/fs/ext4/file.c 2007-08-06 22:18:09.000000000 -0700
83 +++ linux-2.6.23-rc2/fs/ext4/file.c 2007-08-06 22:19:05.000000000 -0700
84 @@ -35,8 +35,8 @@ static int ext4_release_file (struct ino
86 /* if we are the last writer on the inode, drop the block reservation */
87 if ((filp->f_mode & FMODE_WRITE) &&
88 - (atomic_read(&inode->i_writecount) == 1))
89 - {
90 + (atomic_read(&inode->i_writecount) == 1) &&
91 + EXT4_I(inode)->i_blocks_reserved == 0) {
92 mutex_lock(&EXT4_I(inode)->truncate_mutex);
93 ext4_discard_reservation(inode);
94 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
95 Index: linux-2.6.23-rc2/fs/ext4/inode.c
96 ===================================================================
97 --- linux-2.6.23-rc2.orig/fs/ext4/inode.c 2007-08-06 22:18:49.000000000 -0700
98 +++ linux-2.6.23-rc2/fs/ext4/inode.c 2007-08-06 22:19:05.000000000 -0700
99 @@ -942,7 +942,7 @@ out:
101 #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
103 -static int ext4_get_block(struct inode *inode, sector_t iblock,
104 +int ext4_get_block(struct inode *inode, sector_t iblock,
105 struct buffer_head *bh_result, int create)
107 handle_t *handle = ext4_journal_current_handle();
108 @@ -1741,9 +1741,34 @@ static const struct address_space_operat
109 .releasepage = ext4_releasepage,
112 +static int ext4_wb_set_page_dirty(struct page *page)
114 + return __set_page_dirty_nobuffers(page);
117 +static struct address_space_operations ext4_writeback_da_aops = {
118 + .readpage = ext4_readpage,
119 + .readpages = ext4_readpages,
120 + .writepage = ext4_wb_writepage,
121 + .writepages = ext4_wb_writepages,
122 + .sync_page = block_sync_page,
123 + .prepare_write = ext4_wb_prepare_write,
124 + .commit_write = ext4_wb_commit_write,
125 + .bmap = ext4_bmap,
126 + .invalidatepage = ext4_wb_invalidatepage,
127 + .releasepage = ext4_wb_releasepage,
128 + .set_page_dirty = ext4_wb_set_page_dirty,
129 + .direct_IO = ext4_direct_IO,
132 void ext4_set_aops(struct inode *inode)
134 - if (ext4_should_order_data(inode))
135 + if (S_ISREG(inode->i_mode) &&
136 + (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
137 + test_opt(inode->i_sb, EXTENTS) &&
138 + test_opt(inode->i_sb, DELAYED_ALLOC))
139 + inode->i_mapping->a_ops = &ext4_writeback_da_aops;
140 + else if (ext4_should_order_data(inode))
141 inode->i_mapping->a_ops = &ext4_ordered_aops;
142 else if (ext4_should_writeback_data(inode))
143 inode->i_mapping->a_ops = &ext4_writeback_aops;
144 @@ -1767,6 +1792,11 @@ int ext4_block_truncate_page(handle_t *h
145 struct buffer_head *bh;
146 int err = 0;
148 + if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
149 + test_opt(inode->i_sb, EXTENTS) &&
150 + test_opt(inode->i_sb, DELAYED_ALLOC))
151 + return ext4_wb_block_truncate_page(handle, page, mapping, from);
153 blocksize = inode->i_sb->s_blocksize;
154 length = blocksize - (offset & (blocksize - 1));
155 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
156 Index: linux-2.6.23-rc2/fs/ext4/super.c
157 ===================================================================
158 --- linux-2.6.23-rc2.orig/fs/ext4/super.c 2007-08-06 22:18:59.000000000 -0700
159 +++ linux-2.6.23-rc2/fs/ext4/super.c 2007-08-06 22:19:05.000000000 -0700
160 @@ -441,6 +441,7 @@ static void ext4_put_super (struct super
161 struct ext4_super_block *es = sbi->s_es;
162 int i;
164 + ext4_wb_release(sb);
165 ext4_reserve_release(sb);
166 ext4_ext_release(sb);
167 ext4_xattr_put_super(sb);
168 @@ -508,6 +509,13 @@ static struct inode *ext4_alloc_inode(st
169 ei->i_block_alloc_info = NULL;
170 ei->vfs_inode.i_version = 1;
171 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
173 + /* FIXME: these wb-related fields could be initialized once */
174 + ei->i_blocks_reserved = 0;
175 + ei->i_md_reserved = 0;
176 + atomic_set(&ei->i_wb_writers, 0);
177 + spin_lock_init(&ei->i_wb_reserved_lock);
179 return &ei->vfs_inode;
182 @@ -738,6 +746,7 @@ enum {
183 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
184 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
185 Opt_grpquota, Opt_extents, Opt_noextents,
186 + Opt_delayed_alloc,
189 static match_table_t tokens = {
190 @@ -791,6 +800,7 @@ static match_table_t tokens = {
191 {Opt_barrier, "barrier=%u"},
192 {Opt_extents, "extents"},
193 {Opt_noextents, "noextents"},
194 + {Opt_delayed_alloc, "delalloc"},
195 {Opt_err, NULL},
196 {Opt_resize, "resize"},
198 @@ -1112,6 +1122,9 @@ clear_qf_name:
199 else
200 clear_opt(sbi->s_mount_opt, BARRIER);
201 break;
202 + case Opt_delayed_alloc:
203 + set_opt(sbi->s_mount_opt, DELAYED_ALLOC);
204 + break;
205 case Opt_ignore:
206 break;
207 case Opt_resize:
208 @@ -1950,6 +1963,7 @@ static int ext4_fill_super (struct super
210 ext4_ext_init(sb);
211 ext4_reserve_init(sb);
212 + ext4_wb_init(sb);
214 lock_kernel();
215 return 0;
216 Index: linux-2.6.23-rc2/fs/ext4/writeback.c
217 ===================================================================
218 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
219 +++ linux-2.6.23-rc2/fs/ext4/writeback.c 2007-08-06 22:19:05.000000000 -0700
220 @@ -0,0 +1,1185 @@
222 + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
223 + * Written by Alex Tomas <alex@clusterfs.com>
225 + * This program is free software; you can redistribute it and/or modify
226 + * it under the terms of the GNU General Public License version 2 as
227 + * published by the Free Software Foundation.
229 + * This program is distributed in the hope that it will be useful,
230 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
231 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
232 + * GNU General Public License for more details.
234 + * You should have received a copy of the GNU General Public License
235 + * along with this program; if not, write to the Free Software
236 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
237 + */
240 + * TODO:
241 + * MUST:
242 + * - flush dirty pages in -ENOSPC case in order to free reserved blocks
243 + * - direct I/O support
244 + * - blocksize != PAGE_CACHE_SIZE support
245 + * - store last unwritten page in ext4_wb_writepages() and
246 + * continue from it in a next run
247 + * WISH:
248 + * - should ext4_wb_writepage() try to flush neighbours?
249 + * - ext4_wb_block_truncate_page() must flush partial truncated pages
250 + * - reservation can be done per write-request in ext4_file_write()
251 + * rather than per-page in ext4_wb_commit_write() -- it's quite
252 + * expensive to recalculate amount of required metadata for every page
253 + * - re-allocation to improve layout
254 + */
256 +#include <linux/module.h>
257 +#include <linux/fs.h>
258 +#include <linux/bio.h>
259 +#include <linux/time.h>
260 +#include <linux/ext4_jbd2.h>
261 +#include <linux/jbd2.h>
262 +#include <linux/ext4_fs_extents.h>
263 +#include <linux/smp_lock.h>
264 +#include <linux/highuid.h>
265 +#include <linux/pagemap.h>
266 +#include <linux/quotaops.h>
267 +#include <linux/string.h>
268 +#include <linux/buffer_head.h>
269 +#include <linux/writeback.h>
270 +#include <linux/mpage.h>
271 +#include <linux/pagevec.h>
272 +#include <linux/backing-dev.h>
273 +#include <linux/spinlock.h>
276 + * If EXT4_WB_STATS is defined, then some stats are collected.
277 + * It will be shown at umount time.
278 + */
279 +#define EXT4_WB_STATS
283 + * With EXT4_WB_SKIP_SMALL defined the patch will try to avoid
284 + * small I/Os ignoring ->writepages() if mapping hasn't enough
285 + * contig. dirty pages
286 + */
287 +#define EXT4_WB_SKIP_SMALL__
289 +#define WB_ASSERT(__x__) if (!(__x__)) BUG();
291 +#define WB_DEBUG__
292 +#ifdef WB_DEBUG
293 +#define wb_debug(fmt, a...) printk(fmt, ##a);
294 +#else
295 +#define wb_debug(fmt, a...)
296 +#endif
298 +#define WB_MAX_PAGES_PER_EXTENT 32768
300 +#define WB_PAGES_PER_ARRAY 60
302 +struct ext4_wb_pages {
303 + struct list_head list;
304 + struct page *pages[WB_PAGES_PER_ARRAY];
305 + unsigned short num, start;
308 +struct ext4_wb_control {
309 + pgoff_t start;
310 + int len, extents;
311 + int blocks_to_release;
312 + struct ext4_wb_pages *pages;
313 + struct list_head list;
314 + struct address_space *mapping;
318 +void ext4_wb_invalidatepage(struct page *, unsigned long);
319 +int ext4_get_block(struct inode *inode, sector_t iblock,
320 + struct buffer_head *bh_result, int create);
323 +static struct page *ext4_wb_pull_page(struct ext4_wb_control *wc)
325 + struct ext4_wb_pages *wp = wc->pages;
327 + BUG_ON(wp == NULL);
328 + BUG_ON(list_empty(&wc->list));
329 + BUG_ON(list_empty(&wp->list));
330 + if (wp->start == wp->num) {
331 + list_del(&wp->list);
332 + kfree(wp);
333 + if (list_empty(&wc->list))
334 + return NULL;
335 + wp = list_entry(wc->list.next, struct ext4_wb_pages, list);
336 + wc->pages = wp;
338 + BUG_ON(list_empty(&wp->list));
339 + return wp->pages[wp->start++];
342 +static struct bio *ext4_wb_bio_alloc(struct inode *inode,
343 + sector_t first_block, int nr_vecs)
345 + gfp_t gfp_flags = GFP_NOFS | __GFP_HIGH;
346 + struct bio *bio;
347 + int maxreq;
349 + maxreq = bio_get_nr_vecs(inode->i_sb->s_bdev);
350 + if (maxreq < nr_vecs)
351 + nr_vecs = maxreq;
353 + bio = bio_alloc(gfp_flags, nr_vecs);
355 + if (bio == NULL && (current->flags & PF_MEMALLOC)) {
356 + while (!bio && (nr_vecs /= 2))
357 + bio = bio_alloc(gfp_flags, nr_vecs);
360 + if (bio) {
361 + bio->bi_bdev = inode->i_sb->s_bdev;
362 + bio->bi_sector = first_block << (inode->i_blkbits - 9);
364 + return bio;
367 +static int ext4_wb_end_io(struct bio *bio, unsigned int bytes, int err)
369 + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
370 + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
372 + if (bio->bi_size)
373 + return 1;
375 + do {
376 + struct page *page = bvec->bv_page;
378 + if (--bvec >= bio->bi_io_vec)
379 + prefetchw(&bvec->bv_page->flags);
381 + if (!uptodate)
382 + SetPageError(page);
383 + end_page_writeback(page);
384 + } while (bvec >= bio->bi_io_vec);
385 + bio_put(bio);
386 + return 0;
389 +static struct bio *ext4_wb_bio_submit(struct bio *bio, handle_t *handle)
391 + bio->bi_end_io = ext4_wb_end_io;
392 + submit_bio(WRITE, bio);
393 + return NULL;
396 +int inline ext4_wb_reserve_space_page(struct page *page, int blocks)
398 + struct inode *inode = page->mapping->host;
399 + int total, mdb, err;
401 + wb_debug("reserve %d blocks for page %lu from inode %lu\n",
402 + blocks, page->index, inode->i_ino);
404 + /* user wants us to reserve blocks for his file. reserving space
405 + * for his (data) blocks isn't enough because adding block may
406 + * involve allocation index/leaf blocks for tree/blockmap.
407 + * so, we need to calculate numbers of needed metadata for worst
408 + * case: block per extent */
410 + spin_lock(&EXT4_I(inode)->i_wb_reserved_lock);
411 + total = EXT4_I(inode)->i_blocks_reserved + blocks;
412 + mdb = ext4_ext_calc_metadata_amount(inode, total);
414 + /* if blockmap needs more metadata, we have to reserve difference */
415 + BUG_ON(mdb < EXT4_I(inode)->i_md_reserved);
416 + mdb = mdb - EXT4_I(inode)->i_md_reserved;
418 + err = ext4_reserve_blocks(inode->i_sb, mdb + blocks);
419 + if (err) {
420 + /* blocks are exhausted? */
421 + spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
422 + return err;
425 + /* blocks have been reserved, account this. I believe
426 + * inode's fields are protected by inode->i_sem */
427 + EXT4_I(inode)->i_blocks_reserved += blocks;
428 + EXT4_I(inode)->i_md_reserved += mdb;
429 + spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
431 + /* we have reserved space on a disk for the page */
432 + SetPageBooked(page);
433 + return 0;
437 + * release space reserved for @blocks of data
438 + * @used signals that @blocks got really allocated and we just
439 + * need to release corresponded over-reserved metadata
440 + */
441 +int inline ext4_wb_release_space(struct inode *inode, int blocks, int used)
443 + int total, mdb, release;
445 + spin_lock(&EXT4_I(inode)->i_wb_reserved_lock);
447 + total = EXT4_I(inode)->i_blocks_reserved - blocks;
448 + mdb = ext4_ext_calc_metadata_amount(inode, total);
450 + /* if blockmap needs lesser metadata, we may release difference */
451 + BUG_ON(mdb > EXT4_I(inode)->i_md_reserved);
452 + mdb = EXT4_I(inode)->i_md_reserved - mdb;
454 + release = mdb;
455 + /* drop reservation only for non-used blocks */
456 + if (!used)
457 + release += blocks;
458 + wb_debug("%u %s: release %d/%d blocks from %u/%u reserved"
459 + " for inode %lu\n", blocks,
460 + used ? "allocated" : "dropped", used ? 0 : blocks,
461 + mdb, EXT4_I(inode)->i_blocks_reserved,
462 + EXT4_I(inode)->i_md_reserved, inode->i_ino);
463 + if (release)
464 + ext4_release_blocks(inode->i_sb, release);
466 + /* update per-inode reservations */
467 + BUG_ON(blocks > EXT4_I(inode)->i_blocks_reserved);
468 + EXT4_I(inode)->i_blocks_reserved -= blocks;
469 + BUG_ON(mdb > EXT4_I(inode)->i_md_reserved);
470 + EXT4_I(inode)->i_md_reserved -= mdb;
472 + spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
474 + return 0;
477 +static inline int ext4_wb_drop_page_reservation(struct page *page)
479 + /* we just allocated blocks for this page. those blocks (and
480 + * probably metadata for them) were reserved before. now we
481 + * should drop reservation mark from the page. if we didn't
482 + * do that then ->invalidatepage() may think page still holds
483 + * reserved blocks. we could release reserved blocks right
484 + * now, but I'd prefer to make this once per several blocks */
485 + wb_debug("drop reservation from page %lu from inode %lu\n",
486 + page->index, page->mapping->host->i_ino);
487 + BUG_ON(!PageBooked(page));
488 + ClearPageBooked(page);
489 + return 0;
492 +static int ext4_wb_submit_extent(struct ext4_wb_control *wc, handle_t *handle,
493 + struct ext4_extent *ex, int new)
495 + struct inode *inode = wc->mapping->host;
496 + int blkbits = inode->i_blkbits;
497 + struct page *page;
498 + unsigned long blk, off, len, remain;
499 + unsigned long pstart, plen, prev;
500 + struct bio *bio = NULL;
501 + int nr_pages;
503 + /*
504 + * we have list of pages in wc and block numbers in ex
505 + * let's cook bios from them and start real I/O
506 + */
508 + BUG_ON(PAGE_CACHE_SHIFT < blkbits);
509 + BUG_ON(list_empty(&wc->list));
511 + wb_debug("cook and submit bios for %u/%u/%u for %lu/%u\n",
512 + le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len),
513 + le32_to_cpu(ex->ee_start), wc->start, wc->len);
515 + blk = le32_to_cpu(ex->ee_block);
516 + remain = le16_to_cpu(ex->ee_len);
517 + wc->extents++;
519 + while (remain) {
520 + page = ext4_wb_pull_page(wc);
521 + if (page == NULL)
522 + break;
524 + pstart = page->index << (PAGE_CACHE_SHIFT - blkbits);
525 + plen = PAGE_SIZE >> blkbits;
526 + if (pstart > blk) {
527 + /* probably extent covers long space and page
528 + * to be written in the middle of it */
529 + BUG_ON(pstart - blk >= remain);
530 + remain -= pstart - blk;
531 + blk = pstart;
533 + BUG_ON(blk < pstart || blk >= pstart + plen);
535 + BUG_ON(!PageUptodate(page));
536 + /* page can get here via mmap(2)
537 + * BUG_ON(!PagePrivate(page));*/
538 + BUG_ON(new && PageMappedToDisk(page));
539 + BUG_ON(!new && !PageMappedToDisk(page));
540 + SetPageMappedToDisk(page);
541 + if (new && PagePrivate(page)) {
542 + /* space is just allocated and it was reserved in
543 + * ->commit_write(). time to release reservation.
544 + * space may not be reserved if page gets dirty
545 + * via mmap. should we reserve it in ->mmap() ? */
546 + prev = min(plen, remain);
547 + ext4_wb_drop_page_reservation(page);
548 + wc->blocks_to_release += prev;
551 +alloc_new_bio:
552 + if (bio == NULL) {
553 + /* +2 because head/tail may belong to different pages */
554 + nr_pages = (le16_to_cpu(ex->ee_len) -
555 + (blk - le32_to_cpu(ex->ee_block)));
556 + nr_pages = (nr_pages >> (PAGE_CACHE_SHIFT - blkbits));
557 + off = le32_to_cpu(ex->ee_start) +
558 + (blk - le32_to_cpu(ex->ee_block));
559 + bio = ext4_wb_bio_alloc(inode, off, nr_pages + 2);
560 + if (bio == NULL)
561 + return -ENOMEM;
564 + off = (blk - pstart) << blkbits;
565 + prev = min(plen, remain);
566 + len = prev << blkbits;
567 + if (bio_add_page(bio, page, len, off) < len) {
568 + bio = ext4_wb_bio_submit(bio, handle);
569 + goto alloc_new_bio;
571 + remain -= prev;
572 + blk += prev;
573 + if (blk < pstart + plen) {
574 + /* extent covers part of the page only.
575 + * it's possible that next extent covers
576 + * the tail. so, we leave page */
577 + printk("blk %lu pstart %lu plen %lu remain"
578 + " %lu prev %lu\n",
579 + blk, pstart, plen, remain, prev);
580 + wc->pages->start--;
581 + BUG_ON(remain != 0);
584 + if (bio)
585 + ext4_wb_bio_submit(bio, handle);
586 + BUG_ON(new && remain != 0);
587 + return 0;
590 +static ext4_fsblk_t
591 +ext4_wb_find_goal(struct inode *inode, struct ext4_ext_path *path,
592 + ext4_fsblk_t block)
594 + struct ext4_inode_info *ei = EXT4_I(inode);
595 + ext4_fsblk_t bg_start;
596 + unsigned long colour;
597 + int depth;
599 + if (path) {
600 + struct ext4_extent *ex;
601 + depth = path->p_depth;
603 + /* try to predict block placement */
604 + ex = path[depth].p_ext;
605 + if (ex)
606 + return le32_to_cpu(ex->ee_start) +
607 + (block - le32_to_cpu(ex->ee_block));
609 + /* it looks index is empty
610 + * try to find starting from index itself */
611 + if (path[depth].p_bh)
612 + return path[depth].p_bh->b_blocknr;
615 + /* OK. use inode's group */
616 + bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
617 + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
618 + colour = (current->pid % 16) *
619 + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
620 + return bg_start + colour + block;
623 +static int ext4_wb_handle_extent(struct inode *inode,
624 + struct ext4_ext_path *path,
625 + struct ext4_ext_cache *ec,
626 + void *cbdata)
628 + struct ext4_wb_control *wc = cbdata;
629 + struct super_block *sb = inode->i_sb;
630 + ext4_fsblk_t goal, pblock;
631 + unsigned long tgen, count;
632 + struct ext4_extent nex;
633 + loff_t new_i_size;
634 + handle_t *handle;
635 + int i, err;
637 + if (ec->ec_type == EXT4_EXT_CACHE_EXTENT) {
638 + /*
639 + * The extent is already allocated. The only thing
640 + * we have to do is to flush correspondend pages.
641 + */
642 + wb_debug("extent %u/%u/%u exist\n",
643 + (unsigned) ec->ec_block,
644 + (unsigned) ec->ec_len,
645 + (unsigned) ec->ec_start);
646 + nex.ee_start = cpu_to_le32(ec->ec_start);
647 + nex.ee_block = cpu_to_le32(ec->ec_block);
648 + nex.ee_len = cpu_to_le16(ec->ec_len);
649 + err = ext4_wb_submit_extent(wc, NULL, &nex, 0);
651 + /* correct on-disk size, if we grow within
652 + * already allocated block */
653 + new_i_size = (loff_t) le32_to_cpu(nex.ee_block) +
654 + le16_to_cpu(nex.ee_len);
655 + new_i_size = new_i_size << inode->i_blkbits;
656 + if (new_i_size > i_size_read(inode))
657 + new_i_size = i_size_read(inode);
658 + if (new_i_size > EXT4_I(inode)->i_disksize) {
659 + EXT4_I(inode)->i_disksize = new_i_size;
660 + mutex_unlock(&EXT4_I(inode)->truncate_mutex);
661 + ext4_dirty_inode(inode);
662 + mutex_lock(&EXT4_I(inode)->truncate_mutex);
664 + return err;
667 + wb_debug("extent %u/%u DOES NOT exist\n", ec->ec_block, ec->ec_len);
669 + /* space for some pages we want to flush hasn't allocated
670 + * yet. so, it's time to allocate space */
671 + tgen = EXT4_I(inode)->i_ext_generation;
672 + count = ext4_ext_calc_credits_for_insert(inode, path);
673 + mutex_unlock(&EXT4_I(inode)->truncate_mutex);
675 + handle = ext4_journal_start(inode,
676 + count + EXT4_DATA_TRANS_BLOCKS(sb) + 1);
677 + if (IS_ERR(handle)) {
678 + mutex_lock(&EXT4_I(inode)->truncate_mutex);
679 + return PTR_ERR(handle);
682 + /* FIXME: we could analyze current path and advice allocator
683 + * to find additional blocks if goal can't be allocated
684 + * this is for better interaction between extents and mballoc
685 + * plus this should improve overall performance */
687 + mutex_lock(&EXT4_I(inode)->truncate_mutex);
688 + if (tgen != EXT4_I(inode)->i_ext_generation) {
689 + /* the tree has changed. so path can be invalid at moment */
690 + ext4_journal_stop(handle);
691 + return EXT_REPEAT;
694 + goal = ext4_wb_find_goal(inode, path, ec->ec_block);
695 + count = ec->ec_len;
697 + /* if this is a tail of closed file, ask allocator don't preallocate */
698 + new_i_size = i_size_read(inode) + sb->s_blocksize - 1;
699 + new_i_size = new_i_size >> inode->i_blkbits;
700 + if (ec->ec_block + count == new_i_size &&
701 + !atomic_read(&inode->i_writecount)) {
702 + /* XXX: disable preallocation for tail */
705 + /* this is a hack to tell the allocator that blocks
706 + * we are going to allocated are already reserved */
707 + EXT4_I(inode)->i_state |= EXT4_STATE_BLOCKS_RESERVED;
708 + pblock = ext4_new_blocks(handle, inode, goal, &count, &err);
709 + EXT4_I(inode)->i_state &= ~EXT4_STATE_BLOCKS_RESERVED;
711 + if (!pblock)
712 + goto out;
714 + BUG_ON(count > ec->ec_len);
715 + BUG_ON(count == 0);
716 + wb_debug("allocated %llu/%lu for %lu (asked %u)\n",
717 + pblock, count, inode->i_ino, ec->ec_len);
719 + /* insert new extent */
720 + nex.ee_start = cpu_to_le32(pblock);
721 + nex.ee_start_hi = 0;
722 + nex.ee_len = cpu_to_le16(count);
723 + nex.ee_block = cpu_to_le32(ec->ec_block);
724 + err = ext4_ext_insert_extent(handle, inode, path, &nex);
725 + if (err)
726 + goto out;
728 + /*
729 + * Putting len of the actual extent we just inserted,
730 + * we are asking ext4_ext_walk_space() to continue
731 + * scanning after that block
732 + */
733 + ec->ec_len = le16_to_cpu(nex.ee_len);
734 + BUG_ON(nex.ee_len == 0);
736 +#ifdef EXT4_WB_STATS
737 + atomic_add(le16_to_cpu(nex.ee_len),
738 + &EXT4_SB(inode->i_sb)->s_wb_allocated);
739 +#endif
741 + wb_debug("inserted %lu/%lu/%lu for %lu (asked %u)\n",
742 + (unsigned long) le32_to_cpu(nex.ee_block),
743 + (unsigned long) le16_to_cpu(nex.ee_len),
744 + (unsigned long) le32_to_cpu(nex.ee_start),
745 + inode->i_ino, ec->ec_len);
747 + /*
748 + * Important! The nex can change after insert. So do not
749 + * use ec for following
750 + */
752 + /* blocks have been allocated for data, so time to drop dirty
753 + * in corresponding buffer_heads to prevent corruptions */
754 + for (i = 0; i < le16_to_cpu(nex.ee_len); i++)
755 + unmap_underlying_metadata(sb->s_bdev,
756 + le32_to_cpu(nex.ee_start) + i);
758 + /* correct on-disk inode size */
759 + if (le16_to_cpu(nex.ee_len) > 0) {
760 + new_i_size = (loff_t) le32_to_cpu(nex.ee_block) +
761 + le16_to_cpu(nex.ee_len);
762 + new_i_size = new_i_size << inode->i_blkbits;
763 + if (new_i_size > i_size_read(inode))
764 + new_i_size = i_size_read(inode);
765 + if (new_i_size > EXT4_I(inode)->i_disksize) {
766 + EXT4_I(inode)->i_disksize = new_i_size;
767 + err = ext4_mark_inode_dirty(handle, inode);
771 + if (ext4_should_order_data(inode))
772 + err = ext4_wb_submit_extent(wc, handle, &nex, 1);
773 + else
774 + err = ext4_wb_submit_extent(wc, NULL, &nex, 1);
776 + /* we don't want to recalculate needed reservation for
777 + * each page. we may do this for each new extent */
778 + ext4_wb_release_space(inode, wc->blocks_to_release, 1);
779 + wc->blocks_to_release = 0;
781 +out:
782 + ext4_journal_stop(handle);
783 + if (err)
784 + printk("EXT4-fs: writeback error = %d\n", err);
785 + return err;
788 +static int ext4_wb_flush(struct ext4_wb_control *wc)
790 + struct list_head *cur, *tmp;
791 + struct inode *inode;
792 + int err, num = 0;
794 + if (wc->len == 0)
795 + return 0;
797 + inode = wc->mapping->host;
798 + wb_debug("start flushing %lu/%u from inode %lu\n",
799 + wc->start, wc->len, inode->i_ino);
801 + wc->pages = list_entry(wc->list.next, struct ext4_wb_pages, list);
802 + wc->extents = 0;
804 + mutex_lock(&EXT4_I(inode)->truncate_mutex);
805 + /* FIXME: last page may be partial */
806 + err = ext4_ext_walk_space(inode, wc->start, wc->len,
807 + ext4_wb_handle_extent, wc);
808 + mutex_unlock(&EXT4_I(inode)->truncate_mutex);
810 + list_for_each_safe(cur, tmp, &wc->list) {
811 + struct ext4_wb_pages *wp;
812 + wp = list_entry(cur, struct ext4_wb_pages, list);
813 + if (err) {
814 + while (wp->start < wp->num) {
815 + struct page *page = wp->pages[wp->start];
816 + BUG_ON(!PageWriteback(page));
817 + end_page_writeback(page);
818 + __set_page_dirty_nobuffers(page);
819 + wp->start++;
821 + } else {
822 + BUG_ON(num != 0);
823 + BUG_ON(wp->start != wp->num - 1 &&
824 + wp->start != wp->num);
826 + list_del(&wp->list);
827 + kfree(wp);
828 + num++;
830 + wc->pages = NULL;
831 + wc->len = 0;
832 + wc->extents = 0;
834 + return err;
837 +static int ext4_wb_add_page(struct ext4_wb_control *wc, struct page *page)
839 + struct ext4_wb_pages *wp = wc->pages;
841 + if (wp == NULL || wp->num == WB_PAGES_PER_ARRAY) {
842 + wp = kmalloc(sizeof(struct ext4_wb_pages), GFP_NOFS);
843 + if (wp == NULL) {
844 + printk("no mem for ext4_wb_pages!\n");
845 + return -ENOMEM;
847 + wp->num = 0;
848 + wp->start = 0;
849 + list_add_tail(&wp->list, &wc->list);
850 + wc->pages = wp;
853 + wp->pages[wp->num] = page;
854 + wp->num++;
856 + return 0;
859 +static inline void
860 +ext4_wb_init_control(struct ext4_wb_control *wc, struct address_space *mapping)
862 + wc->mapping = mapping;
863 + wc->len = 0;
864 + wc->blocks_to_release = 0;
865 + INIT_LIST_HEAD(&wc->list);
866 + wc->pages = NULL;
869 +static inline int
870 +ext4_wb_can_merge(struct ext4_wb_control *wc, unsigned long next)
872 + if (wc->start + wc->len == next &&
873 + wc->len <= WB_MAX_PAGES_PER_EXTENT)
874 + return 1;
875 + return 0;
878 +int ext4_wb_writepages(struct address_space *mapping,
879 + struct writeback_control *wbc)
881 + struct backing_dev_info *bdi = mapping->backing_dev_info;
882 + struct inode *inode = mapping->host;
883 + int nr_pages, i, err = 0, done = 0;
884 + struct ext4_wb_control wc;
885 + struct pagevec pvec;
886 + pgoff_t index = 0;
887 + int written = 0;
888 + int extents = 0;
889 + pgoff_t pindex = 0;
891 + wb_debug("->writepages on inode %lu (%u reserved)\n",
892 + inode->i_ino, EXT4_I(inode)->i_blocks_reserved);
893 +#ifdef EXT4_WB_SKIP_SMALL
894 + if (wbc->nr_to_write <= 64 && wbc->sync_mode == WB_SYNC_NONE)
895 + return 0;
896 +#endif
897 + atomic_inc(&EXT4_I(inode)->i_wb_writers);
898 +#ifdef EXT4_WB_STATS
899 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_reqs);
900 + atomic_add(wbc->nr_to_write, &EXT4_SB(inode->i_sb)->s_wb_nr_to_write);
901 + if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
902 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions);
903 +#endif
905 + /* skip opened-for-write small files
906 + * XXX: what do we do if most of files hit the condition? */
907 + if (wbc->sync_mode == WB_SYNC_NONE &&
908 + atomic_read(&inode->i_writecount) &&
909 + i_size_read(inode) <= 64*1024) {
910 + return 0;
913 + ext4_wb_init_control(&wc, mapping);
915 + pagevec_init(&pvec, 0);
916 + while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
917 + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
918 + for (i = 0; i < nr_pages; i++) {
919 + struct page *page = pvec.pages[i];
921 + lock_page(page);
923 + if (wbc->sync_mode != WB_SYNC_NONE)
924 + wait_on_page_writeback(page);
926 + if (page->mapping != mapping) {
927 + unlock_page(page);
928 + continue;
930 + if (PageWriteback(page)) {
931 + unlock_page(page);
932 + continue;
935 + if (wc.len && ext4_wb_can_merge(&wc, page->index) &&
936 + wbc->nr_to_write <= 0) {
937 + /*
938 + * If we already exhausted blocks we got
939 + * to write and new extent starts, stop
940 + * writeback
941 + */
942 + unlock_page(page);
943 + done = 1;
944 + break;
948 + if (!clear_page_dirty_for_io(page)) {
949 + unlock_page(page);
950 + continue;
953 + set_page_writeback(page);
954 + unlock_page(page);
956 + if (wc.len == 0) {
957 + wc.start = page->index;
958 + wc.len = 1;
959 + extents++;
960 + } else if (ext4_wb_can_merge(&wc, page->index)) {
961 + wc.len++;
962 + } else {
963 + /* end of current extent: flush it ... */
964 +#if 0
965 + if (wc.len < 64 && wc.len > 0) {
966 + printk("#%u: wow! short extent %d for flush on #%lu\n",
967 + (unsigned) current->pid, wc.len, inode->i_ino);
968 + printk("#%u: done = %d, nr_to_write %ld, sync = %d\n",
969 + (unsigned) current->pid, done, wbc->nr_to_write,
970 + wbc->sync_mode);
971 + printk("#%u: written %d, extents %d\n",
972 + (unsigned) current->pid, written, extents);
973 + printk("#%u: cur %lu, prev %lu\n",
974 + (unsigned) current->pid,
975 + (unsigned long) page->index,
976 + (unsigned long) pindex);
978 +#endif
979 + err = ext4_wb_flush(&wc);
980 + if (err) {
981 + done = 1;
982 + end_page_writeback(page);
983 + break;
986 + /* ... and start new one */
987 + BUG_ON(!PageWriteback(page));
988 + wc.start = page->index;
989 + wc.len = 1;
990 + extents++;
993 + pindex = page->index;
994 + err = ext4_wb_add_page(&wc, page);
995 + if (err) {
996 + done = 1;
997 + end_page_writeback(page);
998 + break;
1000 + written++;
1002 + wbc->nr_to_write--;
1003 +#if 0
1004 + if ((--(wbc->nr_to_write) <= 0))
1005 + done = 1;
1006 +#endif
1007 + if (wbc->nonblocking && bdi_write_congested(bdi)) {
1008 +#ifdef EXT4_WB_STATS
1009 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_congested);
1010 +#endif
1011 + wbc->encountered_congestion = 1;
1012 + done = 1;
1015 + pagevec_release(&pvec);
1017 + if (!err) {
1018 +#ifdef EXT4_WB_SKIP_SMALL
1019 + if (wc.len > 0 && wc.len < 64
1020 + && wbc->sync_mode == WB_SYNC_NONE) {
1021 + struct list_head *cur, *tmp;
1022 + list_for_each_safe(cur, tmp, &wc.list) {
1023 + struct ext4_wb_pages *wp;
1024 + wp = list_entry(cur, struct ext4_wb_pages,
1025 + list);
1026 + for (i = wp->start; i < wp->num; i++) {
1027 + struct page *page = wp->pages[i];
1028 + BUG_ON(!PageWriteback(page));
1029 + end_page_writeback(page);
1030 + __set_page_dirty_nobuffers(page);
1032 + wbc->nr_to_write += i;
1033 + list_del(&wp->list);
1034 + kfree(wp);
1036 + } else
1037 +#endif
1038 + ext4_wb_flush(&wc);
1041 + atomic_dec(&EXT4_I(inode)->i_wb_writers);
1043 +#ifdef EXT4_WB_STATS
1044 + atomic_add(written, &EXT4_SB(inode->i_sb)->s_wb_blocks);
1045 + atomic_add(extents, &EXT4_SB(inode->i_sb)->s_wb_extents);
1046 +#endif
1047 + return 0;
1050 +static void ext4_wb_clear_page(struct page *page, int from, int to)
1052 + void *kaddr;
1054 + if (to < PAGE_CACHE_SIZE || from > 0) {
1055 + kaddr = kmap_atomic(page, KM_USER0);
1056 + if (PAGE_CACHE_SIZE > to)
1057 + memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
1058 + if (0 < from)
1059 + memset(kaddr, 0, from);
1060 + flush_dcache_page(page);
1061 + kunmap_atomic(kaddr, KM_USER0);
1065 +int ext4_wb_prepare_write(struct file *file, struct page *page,
1066 + unsigned from, unsigned to)
1068 + struct inode *inode = page->mapping->host;
1069 + struct buffer_head bh, *bhw = &bh;
1070 + int err = 0;
1072 + wb_debug("prepare page %lu (%u-%u) for inode %lu\n",
1073 + page->index, from, to, page->mapping->host->i_ino);
1075 + /* if page is uptodate this means that ->prepare_write() has
1076 + * been called on page before and page is mapped to disk or
1077 + * we did reservation. page is protected and nobody can
1078 + * access it. hence, it is safe to use page->private to pass
1079 + * flag that ->commit_write() has to reserve blocks. because
1080 + * an error may occur after ->prepare_write() we should not
1081 + * reserve block here. it's better to do in ->commit_write()
1082 + * when we're sure page is to be written */
1083 + page->private = 0;
1084 + if (!PageUptodate(page)) {
1085 + /* first write to this page */
1086 + bh.b_state = 0;
1087 + err = ext4_get_block(inode, page->index, bhw, 0);
1088 + if (err)
1089 + return err;
1090 + if (!buffer_mapped(bhw)) {
1091 + /* this block isn't allocated yet, reserve space */
1092 + wb_debug("reserve space for new block\n");
1093 + page->private = 1;
1094 + ext4_wb_clear_page(page, from, to);
1095 + ClearPageMappedToDisk(page);
1096 + } else {
1097 + /* block is already mapped, so no need to reserve */
1098 + BUG_ON(PagePrivate(page));
1099 + if (to - from < PAGE_CACHE_SIZE) {
1100 + wb_debug("read block %u\n",
1101 + (unsigned) bhw->b_blocknr);
1102 + set_bh_page(bhw, page, 0);
1103 + bhw->b_this_page = NULL;
1104 + bhw->b_size = 1 << inode->i_blkbits;
1105 + atomic_set(&bhw->b_count, 1);
1106 + ll_rw_block(READ, 1, &bhw);
1107 + wait_on_buffer(bhw);
1108 + if (!buffer_uptodate(bhw))
1109 + return -EIO;
1111 + SetPageMappedToDisk(page);
1113 + } else if (!PageMappedToDisk(page) && !PagePrivate(page)) {
1114 + /* this page was a hole at time of mmap() calling
1115 + * now someone wants to modify it by sys_write() */
1116 + wb_debug("reserve block for hole\n");
1117 + page->private = 1;
1120 + return 0;
1123 +int ext4_wb_commit_write(struct file *file, struct page *page,
1124 + unsigned from, unsigned to)
1126 + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1127 + struct inode *inode = page->mapping->host;
1128 + int err = 0;
1130 + wb_debug("commit page %lu (%u-%u) for inode %lu\n",
1131 + page->index, from, to, inode->i_ino);
1133 + /* mark page private so that we get
1134 + * called to invalidate/release page */
1135 + SetPagePrivate(page);
1137 + if (!PageBooked(page) && !PageMappedToDisk(page)) {
1138 + /* ->prepare_write() observed that block for this
1139 + * page hasn't been allocated yet. therefore it
1140 + * asked to reserve block for later allocation */
1141 + BUG_ON(page->private == 0);
1142 + page->private = 0;
1143 + err = ext4_wb_reserve_space_page(page, 1);
1144 + if (err)
1145 + return err;
1148 + /* ok. block for this page is allocated already or it has
1149 + * been reserved successfully. so, user may use it */
1150 + __set_page_dirty_nobuffers(page);
1152 + SetPageUptodate(page);
1154 + /* correct in-core size, on-disk size will
1155 + * be corrected upon allocation */
1156 + if (pos > inode->i_size) {
1157 + i_size_write(inode, pos);
1158 + mark_inode_dirty(inode);
1161 + return err;
1164 +int ext4_wb_write_single_page(struct page *page,
1165 + struct writeback_control *wbc)
1167 + struct inode *inode = page->mapping->host;
1168 + struct ext4_wb_control wc;
1169 + int err;
1171 + atomic_inc(&EXT4_I(inode)->i_wb_writers);
1173 +#ifdef EXT4_WB_STATS
1174 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_single_pages);
1175 + if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
1176 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions_sp);
1177 +#endif
1179 + ext4_wb_init_control(&wc, page->mapping);
1181 + BUG_ON(PageWriteback(page));
1182 + set_page_writeback(page);
1183 + unlock_page(page);
1185 + wc.start = page->index;
1186 + wc.len = 1;
1188 + err = ext4_wb_add_page(&wc, page);
1189 + if (err) {
1190 + printk(KERN_ERR "EXT4-fs: cant add page at %s:%d - %d\n",
1191 + __FILE__, __LINE__, err);
1192 + end_page_writeback(page);
1193 + return err;
1195 + err = ext4_wb_flush(&wc);
1196 + atomic_dec(&EXT4_I(inode)->i_wb_writers);
1198 + return err;
1201 +int ext4_wb_writepage(struct page *page, struct writeback_control *wbc)
1203 + struct inode *inode = page->mapping->host;
1204 + loff_t i_size = i_size_read(inode);
1205 + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
1206 + unsigned offset;
1207 + void *kaddr;
1209 + wb_debug("writepage %lu from inode %lu\n", page->index, inode->i_ino);
1211 + /*
1212 + * FIXME: just to play ...
1213 + * If another thread is writing inode's data and the page
1214 + * hasn't space on a disk yet, leave it for that thread
1215 + */
1216 +#if 1
1217 + if (atomic_read(&EXT4_I(page->mapping->host)->i_wb_writers)
1218 + && !PageMappedToDisk(page)) {
1219 + __set_page_dirty_nobuffers(page);
1220 + unlock_page(page);
1221 + return 0;
1223 +#endif
1225 + /* we give up here if we're reentered, because
1226 + * it might be for a different filesystem */
1227 + if (ext4_journal_current_handle()) {
1228 + __set_page_dirty_nobuffers(page);
1229 + unlock_page(page);
1230 + return 0;
1233 + /* Is the page fully inside i_size? */
1234 + if (page->index < end_index)
1235 + return ext4_wb_write_single_page(page, wbc);
1237 + /* Is the page fully outside i_size? (truncate in progress) */
1238 + offset = i_size & (PAGE_CACHE_SIZE-1);
1239 + if (page->index >= end_index + 1 || !offset) {
1240 + /*
1241 + * The page may have dirty, unmapped buffers. For example,
1242 + * they may have been added in ext4_writepage(). Make them
1243 + * freeable here, so the page does not leak.
1244 + */
1245 + ext4_wb_invalidatepage(page, 0);
1246 + unlock_page(page);
1247 + return 0; /* don't care */
1250 + /*
1251 + * The page straddles i_size. It must be zeroed out on each and every
1252 + * writepage invocation because it may be mmapped. "A file is mapped
1253 + * in multiples of the page size. For a file that is not a multiple of
1254 + * the page size, the remaining memory is zeroed when mapped, and
1255 + * writes to that region are not written out to the file."
1256 + */
1257 + kaddr = kmap_atomic(page, KM_USER0);
1258 + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
1259 + flush_dcache_page(page);
1260 + kunmap_atomic(kaddr, KM_USER0);
1261 + return ext4_wb_write_single_page(page, wbc);
1264 +int ext4_wb_releasepage(struct page *page, gfp_t wait)
1266 + wb_debug("release %sM%sR page %lu from inode %lu (wait %d)\n",
1267 + PageMappedToDisk(page) ? "" : "!",
1268 + PageBooked(page) ? "" : "!",
1269 + page->index, page->mapping->host->i_ino, wait);
1271 + if (PageWriteback(page))
1272 + return 0;
1274 + if (PagePrivate(page))
1275 + ClearPagePrivate(page);
1276 + return 0;
1279 +void ext4_wb_invalidatepage(struct page *page, unsigned long offset)
1281 + struct inode *inode = page->mapping->host;
1282 + int ret = 0;
1284 + /* ->invalidatepage() is called when page is marked Private.
1285 + * for our page being Private mean that space has been
1286 + * reserved for this page and it is being truncated. so,
1287 + * it's time to drop reservation */
1288 + wb_debug("invalidate %sM%sR page %lu from inode %lu (offset %lu)\n",
1289 + PageMappedToDisk(page) ? "" : "!",
1290 + PageBooked(page) ? "" : "!",
1291 + page->index, inode->i_ino, offset);
1293 + if (offset == 0) {
1294 + if (PageBooked(page)) {
1295 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_dropped);
1296 + ext4_wb_release_space(inode, 1, 0);
1297 + ext4_wb_drop_page_reservation(page);
1299 + ret = try_to_release_page(page, 0);
1301 + return;
1304 +int ext4_wb_block_truncate_page(handle_t *handle, struct page *page,
1305 + struct address_space *mapping, loff_t from)
1307 + unsigned offset = from & (PAGE_CACHE_SIZE-1);
1308 + struct inode *inode = mapping->host;
1309 + struct buffer_head bh, *bhw = &bh;
1310 + unsigned blocksize, length;
1311 + void *kaddr;
1312 + int err = 0;
1314 + wb_debug("partial truncate from %lu on page %lu from inode %lu\n",
1315 + (unsigned long) from, page->index, inode->i_ino);
1317 + blocksize = inode->i_sb->s_blocksize;
1318 + length = blocksize - (offset & (blocksize - 1));
1320 + /* if page isn't uptodate we have to check has it assigned block
1321 + * if it has then that block is to be read before memset() */
1322 + if (!PageUptodate(page)) {
1323 + BUG_ON(PageMappedToDisk(page));
1324 + bh.b_state = 0;
1325 + err = ext4_get_block(inode, page->index, bhw, 0);
1326 + if (err)
1327 + goto err_out;
1328 + BUG_ON(buffer_new(bhw));
1329 + if (buffer_mapped(bhw)) {
1330 + /* time to retrieve data from a disk */
1331 + wb_debug("read block %u for part.trunc on %lu\n",
1332 + (unsigned) bhw->b_blocknr, page->index);
1333 + set_bh_page(bhw, page, 0);
1334 + bhw->b_this_page = NULL;
1335 + bhw->b_size = 1 << inode->i_blkbits;
1336 + atomic_set(&bhw->b_count, 1);
1337 + ll_rw_block(READ, 1, &bhw);
1338 + wait_on_buffer(bhw);
1339 + err = -EIO;
1340 + if (!buffer_uptodate(bhw))
1341 + goto err_out;
1342 + SetPageMappedToDisk(page);
1343 + } else {
1344 + wb_debug("zero page %lu (part.trunc)\n", page->index);
1345 + offset = 0;
1346 + length = blocksize;
1350 + kaddr = kmap_atomic(page, KM_USER0);
1351 + memset(kaddr + offset, 0, length);
1352 + flush_dcache_page(page);
1353 + kunmap_atomic(kaddr, KM_USER0);
1354 + SetPageUptodate(page);
1355 + __set_page_dirty_nobuffers(page);
1357 +err_out:
1358 + unlock_page(page);
1359 + page_cache_release(page);
1360 + return err;
1363 +void ext4_wb_init(struct super_block *sb)
1365 + if (!test_opt(sb, DELAYED_ALLOC))
1366 + return;
1368 + if (PAGE_CACHE_SHIFT != sb->s_blocksize_bits) {
1369 + printk(KERN_ERR "EXT4-fs: delayed allocation isn't"
1370 + "supported for PAGE_CACHE_SIZE != blocksize yet\n");
1371 + clear_opt (EXT4_SB(sb)->s_mount_opt, DELAYED_ALLOC);
1372 + return;
1374 + printk("EXT4-fs: delayed allocation enabled\n");
1377 +void ext4_wb_release(struct super_block *sb)
1379 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1381 + if (!test_opt(sb, DELAYED_ALLOC))
1382 + return;
1384 +#ifdef EXT4_WB_STATS
1385 + if (atomic_read(&sbi->s_wb_reqs) == 0)
1386 + return;
1388 + printk("EXT4-fs: writeback: %d blocks %d extents in %d reqs (%d ave)\n",
1389 + atomic_read(&sbi->s_wb_blocks),
1390 + atomic_read(&sbi->s_wb_extents),
1391 + atomic_read(&sbi->s_wb_reqs),
1392 + atomic_read(&sbi->s_wb_blocks) / atomic_read(&sbi->s_wb_reqs));
1393 + printk("EXT4-fs: writeback: %d nr_to_write, %d congestions, %d singles\n",
1394 + atomic_read(&sbi->s_wb_nr_to_write),
1395 + atomic_read(&sbi->s_wb_congested),
1396 + atomic_read(&sbi->s_wb_single_pages));
1397 + printk("EXT4-fs: writeback: %d collisions, %d single-page collisions\n",
1398 + atomic_read(&sbi->s_wb_collisions),
1399 + atomic_read(&sbi->s_wb_collisions_sp));
1400 + printk("EXT4-fs: writeback: %d allocated, %d dropped\n",
1401 + atomic_read(&sbi->s_wb_allocated),
1402 + atomic_read(&sbi->s_wb_dropped));
1403 +#endif
1406 Index: linux-2.6.23-rc2/include/linux/ext4_fs.h
1407 ===================================================================
1408 --- linux-2.6.23-rc2.orig/include/linux/ext4_fs.h 2007-08-06 22:18:59.000000000 -0700
1409 +++ linux-2.6.23-rc2/include/linux/ext4_fs.h 2007-08-06 22:19:05.000000000 -0700
1410 @@ -488,6 +488,8 @@ do { \
1411 #define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
1412 #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
1413 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
1414 +#define EXT4_MOUNT_DELAYED_ALLOC 0x2000000/* Delayed allocation support*/
1416 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
1417 #ifndef _LINUX_EXT2_FS_H
1418 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
1419 @@ -1101,6 +1103,21 @@ ext4_get_blocks_wrap(handle_t *handle, s
1423 +/* writeback.c */
1424 +extern int ext4_wb_writepages(struct address_space *,
1425 + struct writeback_control *);
1426 +extern int ext4_wb_prepare_write(struct file *file, struct page *page,
1427 + unsigned from, unsigned to);
1428 +extern int ext4_wb_commit_write(struct file *, struct page *, unsigned,
1429 + unsigned);
1430 +extern int ext4_wb_writepage(struct page *, struct writeback_control *);
1431 +extern void ext4_wb_invalidatepage(struct page *, unsigned long);
1432 +extern int ext4_wb_releasepage(struct page *, gfp_t);
1433 +extern int ext4_wb_block_truncate_page(handle_t *, struct page *,
1434 + struct address_space *, loff_t);
1435 +extern void ext4_wb_init(struct super_block *);
1436 +extern void ext4_wb_release(struct super_block *);
1438 #endif /* __KERNEL__ */
1440 #endif /* _LINUX_EXT4_FS_H */
1441 Index: linux-2.6.23-rc2/include/linux/ext4_fs_extents.h
1442 ===================================================================
1443 --- linux-2.6.23-rc2.orig/include/linux/ext4_fs_extents.h 2007-08-06 22:18:09.000000000 -0700
1444 +++ linux-2.6.23-rc2/include/linux/ext4_fs_extents.h 2007-08-06 22:19:05.000000000 -0700
1445 @@ -235,6 +235,7 @@ extern unsigned int ext4_ext_check_overl
1446 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
1447 extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
1448 extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);
1449 +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
1451 #endif /* _LINUX_EXT4_EXTENTS */
1453 Index: linux-2.6.23-rc2/include/linux/ext4_fs_i.h
1454 ===================================================================
1455 --- linux-2.6.23-rc2.orig/include/linux/ext4_fs_i.h 2007-08-06 22:18:09.000000000 -0700
1456 +++ linux-2.6.23-rc2/include/linux/ext4_fs_i.h 2007-08-06 22:19:05.000000000 -0700
1457 @@ -158,6 +158,10 @@ struct ext4_inode_info {
1458 * struct timespec i_{a,c,m}time in the generic inode.
1460 struct timespec i_crtime;
1461 + __u32 i_blocks_reserved;
1462 + __u32 i_md_reserved;
1463 + spinlock_t i_wb_reserved_lock; /* to protect i_md_reserved */
1464 + atomic_t i_wb_writers;
1467 #endif /* _LINUX_EXT4_FS_I */
1468 Index: linux-2.6.23-rc2/include/linux/ext4_fs_sb.h
1469 ===================================================================
1470 --- linux-2.6.23-rc2.orig/include/linux/ext4_fs_sb.h 2007-08-06 22:18:59.000000000 -0700
1471 +++ linux-2.6.23-rc2/include/linux/ext4_fs_sb.h 2007-08-06 22:19:05.000000000 -0700
1472 @@ -97,6 +97,17 @@ struct ext4_sb_info {
1473 unsigned long s_ext_blocks;
1474 unsigned long s_ext_extents;
1475 #endif
1477 + atomic_t s_wb_congested;
1478 + atomic_t s_wb_single_pages;
1479 + atomic_t s_wb_collisions_sp;
1480 + atomic_t s_wb_allocated;
1481 + atomic_t s_wb_reqs;
1482 + atomic_t s_wb_nr_to_write;
1483 + atomic_t s_wb_collisions;
1484 + atomic_t s_wb_blocks;
1485 + atomic_t s_wb_extents;
1486 + atomic_t s_wb_dropped;
1489 #endif /* _LINUX_EXT4_FS_SB */