Rebase to 2.6.24-rc8
[ext4-patch-queue.git] / ext4-delayed-allocation.patch
blob1a447dacb2ff6dc93b4ed0968bd476c5bba45ada
1 ext4: [RFC] delayed allocation for ext4
3 From: Alex Tomas <alex@clusterfs.com>
5 delayed allocation itself, enabled by "delalloc" mount option.
6 extents support is also required. currently it works only
7 with blocksize=pagesize.
9 Shaggy fixed a lot of endian errors reported by sparse
11 Signed-off-by: Alex Tomas <alex@clusterfs.com>
12 ---
14 fs/ext4/Makefile | 2
15 fs/ext4/extents.c | 30 +
16 fs/ext4/file.c | 4
17 fs/ext4/inode.c | 34 +
18 fs/ext4/super.c | 14
19 fs/ext4/writeback.c | 1185 ++++++++++++++++++++++++++++++++++++++++
20 include/linux/ext4_fs.h | 17
21 include/linux/ext4_fs_extents.h | 1
22 include/linux/ext4_fs_i.h | 4
23 include/linux/ext4_fs_sb.h | 11
24 10 files changed, 1297 insertions(+), 5 deletions(-)
26 Index: linux-2.6.23-rc2/fs/ext4/Makefile
27 ===================================================================
28 --- linux-2.6.23-rc2.orig/fs/ext4/Makefile 2007-08-06 22:18:09.000000000 -0700
29 +++ linux-2.6.23-rc2/fs/ext4/Makefile 2007-08-06 22:19:05.000000000 -0700
30 @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
32 ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
33 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
34 - ext4_jbd2.o
35 + ext4_jbd2.o writeback.o
37 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
38 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
39 Index: linux-2.6.23-rc2/fs/ext4/extents.c
40 ===================================================================
41 --- linux-2.6.23-rc2.orig/fs/ext4/extents.c 2007-08-06 22:18:09.000000000 -0700
42 +++ linux-2.6.23-rc2/fs/ext4/extents.c 2007-08-06 22:19:05.000000000 -0700
43 @@ -2508,6 +2508,36 @@ int ext4_ext_writepage_trans_blocks(stru
44 return needed;
47 +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
49 + int lcap, icap, rcap, leafs, idxs, num;
51 + rcap = ext4_ext_space_root(inode);
52 + if (blocks <= rcap) {
53 + /* all extents fit to the root */
54 + return 0;
55 + }
57 + rcap = ext4_ext_space_root_idx(inode);
58 + lcap = ext4_ext_space_block(inode);
59 + icap = ext4_ext_space_block_idx(inode);
61 + num = leafs = (blocks + lcap - 1) / lcap;
62 + if (leafs <= rcap) {
63 + /* all pointers to leafs fit to the root */
64 + return leafs;
65 + }
67 + /* ok. we need separate index block(s) to link all leaf blocks */
68 + idxs = (leafs + icap - 1) / icap;
69 + do {
70 + num += idxs;
71 + idxs = (idxs + icap - 1) / icap;
72 + } while (idxs > rcap);
74 + return num;
78 * preallocate space for a file. This implements ext4's fallocate inode
79 * operation, which gets called from sys_fallocate system call.
80 Index: linux-2.6.23-rc2/fs/ext4/file.c
81 ===================================================================
82 --- linux-2.6.23-rc2.orig/fs/ext4/file.c 2007-08-06 22:18:09.000000000 -0700
83 +++ linux-2.6.23-rc2/fs/ext4/file.c 2007-08-06 22:19:05.000000000 -0700
84 @@ -35,8 +35,8 @@ static int ext4_release_file (struct ino
86 /* if we are the last writer on the inode, drop the block reservation */
87 if ((filp->f_mode & FMODE_WRITE) &&
88 - (atomic_read(&inode->i_writecount) == 1))
89 - {
90 + (atomic_read(&inode->i_writecount) == 1) &&
91 + EXT4_I(inode)->i_blocks_reserved == 0) {
92 mutex_lock(&EXT4_I(inode)->truncate_mutex);
93 ext4_discard_reservation(inode);
94 mutex_unlock(&EXT4_I(inode)->truncate_mutex);
95 Index: linux-2.6.23-rc2/fs/ext4/inode.c
96 ===================================================================
97 --- linux-2.6.23-rc2.orig/fs/ext4/inode.c 2007-08-06 22:18:49.000000000 -0700
98 +++ linux-2.6.23-rc2/fs/ext4/inode.c 2007-08-06 22:19:05.000000000 -0700
99 @@ -942,7 +942,7 @@ out:
101 #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
103 -static int ext4_get_block(struct inode *inode, sector_t iblock,
104 +int ext4_get_block(struct inode *inode, sector_t iblock,
105 struct buffer_head *bh_result, int create)
107 handle_t *handle = ext4_journal_current_handle();
108 @@ -1741,9 +1741,34 @@ static const struct address_space_operat
109 .releasepage = ext4_releasepage,
112 +static int ext4_wb_set_page_dirty(struct page *page)
114 + return __set_page_dirty_nobuffers(page);
117 +static struct address_space_operations ext4_writeback_da_aops = {
118 + .readpage = ext4_readpage,
119 + .readpages = ext4_readpages,
120 + .writepage = ext4_wb_writepage,
121 + .writepages = ext4_wb_writepages,
122 + .sync_page = block_sync_page,
123 + .prepare_write = ext4_wb_prepare_write,
124 + .commit_write = ext4_wb_commit_write,
125 + .bmap = ext4_bmap,
126 + .invalidatepage = ext4_wb_invalidatepage,
127 + .releasepage = ext4_wb_releasepage,
128 + .set_page_dirty = ext4_wb_set_page_dirty,
129 + .direct_IO = ext4_direct_IO,
132 void ext4_set_aops(struct inode *inode)
134 - if (ext4_should_order_data(inode))
135 + if (S_ISREG(inode->i_mode) &&
136 + (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
137 + test_opt(inode->i_sb, EXTENTS) &&
138 + test_opt(inode->i_sb, DELAYED_ALLOC))
139 + inode->i_mapping->a_ops = &ext4_writeback_da_aops;
140 + else if (ext4_should_order_data(inode))
141 inode->i_mapping->a_ops = &ext4_ordered_aops;
142 else if (ext4_should_writeback_data(inode))
143 inode->i_mapping->a_ops = &ext4_writeback_aops;
144 @@ -1767,6 +1792,11 @@ int ext4_block_truncate_page(handle_t *h
145 struct buffer_head *bh;
146 int err = 0;
148 + if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
149 + test_opt(inode->i_sb, EXTENTS) &&
150 + test_opt(inode->i_sb, DELAYED_ALLOC))
151 + return ext4_wb_block_truncate_page(handle, page, mapping, from);
153 blocksize = inode->i_sb->s_blocksize;
154 length = blocksize - (offset & (blocksize - 1));
155 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
156 Index: linux-2.6.23-rc2/fs/ext4/super.c
157 ===================================================================
158 --- linux-2.6.23-rc2.orig/fs/ext4/super.c 2007-08-06 22:18:59.000000000 -0700
159 +++ linux-2.6.23-rc2/fs/ext4/super.c 2007-08-06 22:19:05.000000000 -0700
160 @@ -441,6 +441,7 @@ static void ext4_put_super (struct super
161 struct ext4_super_block *es = sbi->s_es;
162 int i;
164 + ext4_wb_release(sb);
165 ext4_reserve_release(sb);
166 ext4_ext_release(sb);
167 ext4_xattr_put_super(sb);
168 @@ -508,6 +509,13 @@ static struct inode *ext4_alloc_inode(st
169 ei->i_block_alloc_info = NULL;
170 ei->vfs_inode.i_version = 1;
171 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
173 + /* FIXME: these wb-related fields could be initialized once */
174 + ei->i_blocks_reserved = 0;
175 + ei->i_md_reserved = 0;
176 + atomic_set(&ei->i_wb_writers, 0);
177 + spin_lock_init(&ei->i_wb_reserved_lock);
179 return &ei->vfs_inode;
182 @@ -738,6 +746,7 @@ enum {
183 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
184 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
185 Opt_grpquota, Opt_extents, Opt_noextents,
186 + Opt_delayed_alloc,
189 static match_table_t tokens = {
190 @@ -791,6 +800,7 @@ static match_table_t tokens = {
191 {Opt_barrier, "barrier=%u"},
192 {Opt_extents, "extents"},
193 {Opt_noextents, "noextents"},
194 + {Opt_delayed_alloc, "delalloc"},
195 {Opt_err, NULL},
196 {Opt_resize, "resize"},
198 @@ -1112,6 +1122,9 @@ clear_qf_name:
199 else
200 clear_opt(sbi->s_mount_opt, BARRIER);
201 break;
202 + case Opt_delayed_alloc:
203 + set_opt(sbi->s_mount_opt, DELAYED_ALLOC);
204 + break;
205 case Opt_ignore:
206 break;
207 case Opt_resize:
208 @@ -1950,6 +1963,7 @@ static int ext4_fill_super (struct super
210 ext4_ext_init(sb);
211 ext4_reserve_init(sb);
212 + ext4_wb_init(sb);
214 lock_kernel();
215 return 0;
216 Index: linux-2.6.23-rc2/fs/ext4/writeback.c
217 ===================================================================
218 --- /dev/null 1970-01-01 00:00:00.000000000 +0000
219 +++ linux-2.6.23-rc2/fs/ext4/writeback.c 2007-08-06 22:19:05.000000000 -0700
220 @@ -0,0 +1,1185 @@
222 + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
223 + * Written by Alex Tomas <alex@clusterfs.com>
225 + * This program is free software; you can redistribute it and/or modify
226 + * it under the terms of the GNU General Public License version 2 as
227 + * published by the Free Software Foundation.
229 + * This program is distributed in the hope that it will be useful,
230 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
231 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
232 + * GNU General Public License for more details.
234 + * You should have received a copy of the GNU General Public License
235 + * along with this program; if not, write to the Free Software
236 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
237 + */
240 + * TODO:
241 + * MUST:
242 + * - flush dirty pages in -ENOSPC case in order to free reserved blocks
243 + * - direct I/O support
244 + * - blocksize != PAGE_CACHE_SIZE support
245 + * - store last unwritten page in ext4_wb_writepages() and
246 + * continue from it in a next run
247 + * WISH:
248 + * - should ext4_wb_writepage() try to flush neighbours?
249 + * - ext4_wb_block_truncate_page() must flush partial truncated pages
250 + * - reservation can be done per write-request in ext4_file_write()
251 + * rather than per-page in ext4_wb_commit_write() -- it's quite
252 + * expensive to recalculate amount of required metadata for every page
253 + * - re-allocation to improve layout
254 + */
256 +#include <linux/module.h>
257 +#include <linux/fs.h>
258 +#include <linux/bio.h>
259 +#include <linux/time.h>
260 +#include <linux/ext4_jbd2.h>
261 +#include <linux/jbd2.h>
262 +#include <linux/ext4_fs_extents.h>
263 +#include <linux/smp_lock.h>
264 +#include <linux/highuid.h>
265 +#include <linux/pagemap.h>
266 +#include <linux/quotaops.h>
267 +#include <linux/string.h>
268 +#include <linux/buffer_head.h>
269 +#include <linux/writeback.h>
270 +#include <linux/mpage.h>
271 +#include <linux/pagevec.h>
272 +#include <linux/backing-dev.h>
273 +#include <linux/spinlock.h>
276 + * If EXT4_WB_STATS is defined, then some stats are collected.
277 + * It will be shown at umount time.
278 + */
279 +#define EXT4_WB_STATS
283 + * With EXT4_WB_SKIP_SMALL defined the patch will try to avoid
284 + * small I/Os ignoring ->writepages() if mapping hasn't enough
285 + * contig. dirty pages
286 + */
287 +#define EXT4_WB_SKIP_SMALL__
289 +#define WB_ASSERT(__x__) if (!(__x__)) BUG();
291 +#define WB_DEBUG__
292 +#ifdef WB_DEBUG
293 +#define wb_debug(fmt, a...) printk(fmt, ##a);
294 +#else
295 +#define wb_debug(fmt, a...)
296 +#endif
298 +#define WB_MAX_PAGES_PER_EXTENT 32768
300 +#define WB_PAGES_PER_ARRAY 60
302 +struct ext4_wb_pages {
303 + struct list_head list;
304 + struct page *pages[WB_PAGES_PER_ARRAY];
305 + unsigned short num, start;
308 +struct ext4_wb_control {
309 + pgoff_t start;
310 + int len, extents;
311 + int blocks_to_release;
312 + struct ext4_wb_pages *pages;
313 + struct list_head list;
314 + struct address_space *mapping;
318 +void ext4_wb_invalidatepage(struct page *, unsigned long);
319 +int ext4_get_block(struct inode *inode, sector_t iblock,
320 + struct buffer_head *bh_result, int create);
323 +static struct page *ext4_wb_pull_page(struct ext4_wb_control *wc)
325 + struct ext4_wb_pages *wp = wc->pages;
327 + BUG_ON(wp == NULL);
328 + BUG_ON(list_empty(&wc->list));
329 + BUG_ON(list_empty(&wp->list));
330 + if (wp->start == wp->num) {
331 + list_del(&wp->list);
332 + kfree(wp);
333 + if (list_empty(&wc->list))
334 + return NULL;
335 + wp = list_entry(wc->list.next, struct ext4_wb_pages, list);
336 + wc->pages = wp;
338 + BUG_ON(list_empty(&wp->list));
339 + return wp->pages[wp->start++];
342 +static struct bio *ext4_wb_bio_alloc(struct inode *inode,
343 + sector_t first_block, int nr_vecs)
345 + gfp_t gfp_flags = GFP_NOFS | __GFP_HIGH;
346 + struct bio *bio;
347 + int maxreq;
349 + maxreq = bio_get_nr_vecs(inode->i_sb->s_bdev);
350 + if (maxreq < nr_vecs)
351 + nr_vecs = maxreq;
353 + bio = bio_alloc(gfp_flags, nr_vecs);
355 + if (bio == NULL && (current->flags & PF_MEMALLOC)) {
356 + while (!bio && (nr_vecs /= 2))
357 + bio = bio_alloc(gfp_flags, nr_vecs);
360 + if (bio) {
361 + bio->bi_bdev = inode->i_sb->s_bdev;
362 + bio->bi_sector = first_block << (inode->i_blkbits - 9);
364 + return bio;
367 +static int ext4_wb_end_io(struct bio *bio, unsigned int bytes, int err)
369 + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
370 + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
372 + if (bio->bi_size)
373 + return 1;
375 + do {
376 + struct page *page = bvec->bv_page;
378 + if (--bvec >= bio->bi_io_vec)
379 + prefetchw(&bvec->bv_page->flags);
381 + if (!uptodate)
382 + SetPageError(page);
383 + end_page_writeback(page);
384 + } while (bvec >= bio->bi_io_vec);
385 + bio_put(bio);
386 + return 0;
389 +static struct bio *ext4_wb_bio_submit(struct bio *bio, handle_t *handle)
391 + bio->bi_end_io = ext4_wb_end_io;
392 + submit_bio(WRITE, bio);
393 + return NULL;
396 +int inline ext4_wb_reserve_space_page(struct page *page, int blocks)
398 + struct inode *inode = page->mapping->host;
399 + int total, mdb, err;
401 + wb_debug("reserve %d blocks for page %lu from inode %lu\n",
402 + blocks, page->index, inode->i_ino);
404 + /* user wants us to reserve blocks for his file. reserving space
405 + * for his (data) blocks isn't enough because adding block may
406 + * involve allocation index/leaf blocks for tree/blockmap.
407 + * so, we need to calculate numbers of needed metadata for worst
408 + * case: block per extent */
410 + spin_lock(&EXT4_I(inode)->i_wb_reserved_lock);
411 + total = EXT4_I(inode)->i_blocks_reserved + blocks;
412 + mdb = ext4_ext_calc_metadata_amount(inode, total);
414 + /* if blockmap needs more metadata, we have to reserve difference */
415 + BUG_ON(mdb < EXT4_I(inode)->i_md_reserved);
416 + mdb = mdb - EXT4_I(inode)->i_md_reserved;
418 + err = ext4_reserve_blocks(inode->i_sb, mdb + blocks);
419 + if (err) {
420 + /* blocks are exhausted? */
421 + spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
422 + return err;
425 + /* blocks have been reserved, account this. I believe
426 + * inode's fields are protected by inode->i_sem */
427 + EXT4_I(inode)->i_blocks_reserved += blocks;
428 + EXT4_I(inode)->i_md_reserved += mdb;
429 + spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
431 + /* we have reserved space on a disk for the page */
432 + SetPageBooked(page);
433 + return 0;
437 + * release space reserved for @blocks of data
438 + * @used signals that @blocks got really allocated and we just
439 + * need to release corresponded over-reserved metadata
440 + */
441 +int inline ext4_wb_release_space(struct inode *inode, int blocks, int used)
443 + int total, mdb, release;
445 + spin_lock(&EXT4_I(inode)->i_wb_reserved_lock);
447 + total = EXT4_I(inode)->i_blocks_reserved - blocks;
448 + mdb = ext4_ext_calc_metadata_amount(inode, total);
450 + /* if blockmap needs lesser metadata, we may release difference */
451 + BUG_ON(mdb > EXT4_I(inode)->i_md_reserved);
452 + mdb = EXT4_I(inode)->i_md_reserved - mdb;
454 + release = mdb;
455 + /* drop reservation only for non-used blocks */
456 + if (!used)
457 + release += blocks;
458 + wb_debug("%u %s: release %d/%d blocks from %u/%u reserved"
459 + " for inode %lu\n", blocks,
460 + used ? "allocated" : "dropped", used ? 0 : blocks,
461 + mdb, EXT4_I(inode)->i_blocks_reserved,
462 + EXT4_I(inode)->i_md_reserved, inode->i_ino);
463 + if (release)
464 + ext4_release_blocks(inode->i_sb, release);
466 + /* update per-inode reservations */
467 + BUG_ON(blocks > EXT4_I(inode)->i_blocks_reserved);
468 + EXT4_I(inode)->i_blocks_reserved -= blocks;
469 + BUG_ON(mdb > EXT4_I(inode)->i_md_reserved);
470 + EXT4_I(inode)->i_md_reserved -= mdb;
472 + spin_unlock(&EXT4_I(inode)->i_wb_reserved_lock);
474 + return 0;
477 +static inline int ext4_wb_drop_page_reservation(struct page *page)
479 + /* we just allocated blocks for this page. those blocks (and
480 + * probably metadata for them) were reserved before. now we
481 + * should drop reservation mark from the page. if we didn't
482 + * do that then ->invalidatepage() may think page still holds
483 + * reserved blocks. we could release reserved blocks right
484 + * now, but I'd prefer to make this once per several blocks */
485 + wb_debug("drop reservation from page %lu from inode %lu\n",
486 + page->index, page->mapping->host->i_ino);
487 + BUG_ON(!PageBooked(page));
488 + ClearPageBooked(page);
489 + return 0;
492 +static int ext4_wb_submit_extent(struct ext4_wb_control *wc, handle_t *handle,
493 + struct ext4_extent *ex, int new)
495 + struct inode *inode = wc->mapping->host;
496 + int blkbits = inode->i_blkbits;
497 + struct page *page;
498 + unsigned long blk, off, len, remain;
499 + unsigned long pstart, plen, prev;
500 + struct bio *bio = NULL;
501 + int nr_pages;
503 + /*
504 + * we have list of pages in wc and block numbers in ex
505 + * let's cook bios from them and start real I/O
506 + */
508 + BUG_ON(PAGE_CACHE_SHIFT < blkbits);
509 + BUG_ON(list_empty(&wc->list));
511 + wb_debug("cook and submit bios for %u/%u/%u for %lu/%u\n",
512 + le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len),
513 + le32_to_cpu(ex->ee_start), wc->start, wc->len);
515 + blk = le32_to_cpu(ex->ee_block);
516 + remain = le16_to_cpu(ex->ee_len);
517 + wc->extents++;
519 + while (remain) {
520 + page = ext4_wb_pull_page(wc);
521 + if (page == NULL)
522 + break;
524 + pstart = page->index << (PAGE_CACHE_SHIFT - blkbits);
525 + plen = PAGE_SIZE >> blkbits;
526 + if (pstart > blk) {
527 + /* probably extent covers long space and page
528 + * to be written in the middle of it */
529 + BUG_ON(pstart - blk >= remain);
530 + remain -= pstart - blk;
531 + blk = pstart;
533 + BUG_ON(blk < pstart || blk >= pstart + plen);
535 + BUG_ON(!PageUptodate(page));
536 + /* page can get here via mmap(2)
537 + * BUG_ON(!PagePrivate(page));*/
538 + BUG_ON(new && PageMappedToDisk(page));
539 + BUG_ON(!new && !PageMappedToDisk(page));
540 + SetPageMappedToDisk(page);
541 + if (new && PagePrivate(page)) {
542 + /* space is just allocated and it was reserved in
543 + * ->commit_write(). time to release reservation.
544 + * space may not be reserved if page gets dirty
545 + * via mmap. should we reserve it in ->mmap() ? */
546 + prev = min(plen, remain);
547 + ext4_wb_drop_page_reservation(page);
548 + wc->blocks_to_release += prev;
551 +alloc_new_bio:
552 + if (bio == NULL) {
553 + /* +2 because head/tail may belong to different pages */
554 + nr_pages = (le16_to_cpu(ex->ee_len) -
555 + (blk - le32_to_cpu(ex->ee_block)));
556 + nr_pages = (nr_pages >> (PAGE_CACHE_SHIFT - blkbits));
557 + off = le32_to_cpu(ex->ee_start) +
558 + (blk - le32_to_cpu(ex->ee_block));
559 + bio = ext4_wb_bio_alloc(inode, off, nr_pages + 2);
560 + if (bio == NULL)
561 + return -ENOMEM;
564 + off = (blk - pstart) << blkbits;
565 + prev = min(plen, remain);
566 + len = prev << blkbits;
567 + if (bio_add_page(bio, page, len, off) < len) {
568 + bio = ext4_wb_bio_submit(bio, handle);
569 + goto alloc_new_bio;
571 + remain -= prev;
572 + blk += prev;
573 + if (blk < pstart + plen) {
574 + /* extent covers part of the page only.
575 + * it's possible that next extent covers
576 + * the tail. so, we leave page */
577 + printk("blk %lu pstart %lu plen %lu remain"
578 + " %lu prev %lu\n",
579 + blk, pstart, plen, remain, prev);
580 + wc->pages->start--;
581 + BUG_ON(remain != 0);
584 + if (bio)
585 + ext4_wb_bio_submit(bio, handle);
586 + BUG_ON(new && remain != 0);
587 + return 0;
590 +static ext4_fsblk_t
591 +ext4_wb_find_goal(struct inode *inode, struct ext4_ext_path *path,
592 + ext4_fsblk_t block)
594 + struct ext4_inode_info *ei = EXT4_I(inode);
595 + ext4_fsblk_t bg_start;
596 + unsigned long colour;
597 + int depth;
599 + if (path) {
600 + struct ext4_extent *ex;
601 + depth = path->p_depth;
603 + /* try to predict block placement */
604 + ex = path[depth].p_ext;
605 + if (ex)
606 + return le32_to_cpu(ex->ee_start) +
607 + (block - le32_to_cpu(ex->ee_block));
609 + /* it looks index is empty
610 + * try to find starting from index itself */
611 + if (path[depth].p_bh)
612 + return path[depth].p_bh->b_blocknr;
615 + /* OK. use inode's group */
616 + bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
617 + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
618 + colour = (current->pid % 16) *
619 + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
620 + return bg_start + colour + block;
623 +static int ext4_wb_handle_extent(struct inode *inode,
624 + struct ext4_ext_path *path,
625 + struct ext4_ext_cache *ec,
626 + void *cbdata)
628 + struct ext4_wb_control *wc = cbdata;
629 + struct super_block *sb = inode->i_sb;
630 + ext4_fsblk_t goal, pblock;
631 + unsigned long tgen, count;
632 + struct ext4_extent nex;
633 + loff_t new_i_size;
634 + handle_t *handle;
635 + int i, err;
637 + if (ec->ec_type == EXT4_EXT_CACHE_EXTENT) {
638 + /*
639 + * The extent is already allocated. The only thing
640 + * we have to do is to flush correspondend pages.
641 + */
642 + wb_debug("extent %u/%u/%u exist\n",
643 + (unsigned) ec->ec_block,
644 + (unsigned) ec->ec_len,
645 + (unsigned) ec->ec_start);
646 + nex.ee_start = cpu_to_le32(ec->ec_start);
647 + nex.ee_block = cpu_to_le32(ec->ec_block);
648 + nex.ee_len = cpu_to_le16(ec->ec_len);
649 + err = ext4_wb_submit_extent(wc, NULL, &nex, 0);
651 + /* correct on-disk size, if we grow within
652 + * already allocated block */
653 + new_i_size = (loff_t) le32_to_cpu(nex.ee_block) +
654 + le16_to_cpu(nex.ee_len);
655 + new_i_size = new_i_size << inode->i_blkbits;
656 + if (new_i_size > i_size_read(inode))
657 + new_i_size = i_size_read(inode);
658 + if (new_i_size > EXT4_I(inode)->i_disksize) {
659 + EXT4_I(inode)->i_disksize = new_i_size;
660 + mutex_unlock(&EXT4_I(inode)->truncate_mutex);
661 + ext4_dirty_inode(inode);
662 + mutex_lock(&EXT4_I(inode)->truncate_mutex);
664 + return err;
667 + wb_debug("extent %u/%u DOES NOT exist\n", ec->ec_block, ec->ec_len);
669 + /* space for some pages we want to flush hasn't allocated
670 + * yet. so, it's time to allocate space */
671 + tgen = EXT4_I(inode)->i_ext_generation;
672 + count = ext4_ext_calc_credits_for_insert(inode, path);
673 + mutex_unlock(&EXT4_I(inode)->truncate_mutex);
675 + handle = ext4_journal_start(inode,
676 + count + EXT4_DATA_TRANS_BLOCKS(sb) + 1);
677 + if (IS_ERR(handle)) {
678 + mutex_lock(&EXT4_I(inode)->truncate_mutex);
679 + return PTR_ERR(handle);
682 + /* FIXME: we could analyze current path and advice allocator
683 + * to find additional blocks if goal can't be allocated
684 + * this is for better interaction between extents and mballoc
685 + * plus this should improve overall performance */
687 + mutex_lock(&EXT4_I(inode)->truncate_mutex);
688 + if (tgen != EXT4_I(inode)->i_ext_generation) {
689 + /* the tree has changed. so path can be invalid at moment */
690 + ext4_journal_stop(handle);
691 + return EXT_REPEAT;
694 + goal = ext4_wb_find_goal(inode, path, ec->ec_block);
695 + count = ec->ec_len;
697 + /* if this is a tail of closed file, ask allocator don't preallocate */
698 + new_i_size = i_size_read(inode) + sb->s_blocksize - 1;
699 + new_i_size = new_i_size >> inode->i_blkbits;
700 + if (ec->ec_block + count == new_i_size &&
701 + !atomic_read(&inode->i_writecount)) {
702 + /* XXX: disable preallocation for tail */
705 + /* this is a hack to tell the allocator that blocks
706 + * we are going to allocated are already reserved */
707 + EXT4_I(inode)->i_state |= EXT4_STATE_BLOCKS_RESERVED;
708 + pblock = ext4_new_blocks(handle, inode, goal, &count, &err);
709 + EXT4_I(inode)->i_state &= ~EXT4_STATE_BLOCKS_RESERVED;
711 + if (!pblock)
712 + goto out;
714 + BUG_ON(count > ec->ec_len);
715 + BUG_ON(count == 0);
716 + wb_debug("allocated %llu/%lu for %lu (asked %u)\n",
717 + pblock, count, inode->i_ino, ec->ec_len);
719 + /* insert new extent */
720 + nex.ee_start = cpu_to_le32(pblock);
721 + nex.ee_start_hi = 0;
722 + nex.ee_len = cpu_to_le16(count);
723 + nex.ee_block = cpu_to_le32(ec->ec_block);
724 + err = ext4_ext_insert_extent(handle, inode, path, &nex);
725 + if (err)
726 + goto out;
728 + /*
729 + * Putting len of the actual extent we just inserted,
730 + * we are asking ext4_ext_walk_space() to continue
731 + * scanning after that block
732 + */
733 + ec->ec_len = le16_to_cpu(nex.ee_len);
734 + BUG_ON(nex.ee_len == 0);
736 +#ifdef EXT4_WB_STATS
737 + atomic_add(le16_to_cpu(nex.ee_len),
738 + &EXT4_SB(inode->i_sb)->s_wb_allocated);
739 +#endif
741 + wb_debug("inserted %lu/%lu/%lu for %lu (asked %u)\n",
742 + (unsigned long) le32_to_cpu(nex.ee_block),
743 + (unsigned long) le16_to_cpu(nex.ee_len),
744 + (unsigned long) le32_to_cpu(nex.ee_start),
745 + inode->i_ino, ec->ec_len);
747 + /*
748 + * Important! The nex can change after insert. So do not
749 + * use ec for following
750 + */
752 + /* blocks have been allocated for data, so time to drop dirty
753 + * in corresponding buffer_heads to prevent corruptions */
754 + for (i = 0; i < le16_to_cpu(nex.ee_len); i++)
755 + unmap_underlying_metadata(sb->s_bdev,
756 + le32_to_cpu(nex.ee_start) + i);
758 + /* correct on-disk inode size */
759 + if (le16_to_cpu(nex.ee_len) > 0) {
760 + new_i_size = (loff_t) le32_to_cpu(nex.ee_block) +
761 + le16_to_cpu(nex.ee_len);
762 + new_i_size = new_i_size << inode->i_blkbits;
763 + if (new_i_size > i_size_read(inode))
764 + new_i_size = i_size_read(inode);
765 + if (new_i_size > EXT4_I(inode)->i_disksize) {
766 + EXT4_I(inode)->i_disksize = new_i_size;
767 + err = ext4_mark_inode_dirty(handle, inode);
771 + if (ext4_should_order_data(inode))
772 + err = ext4_wb_submit_extent(wc, handle, &nex, 1);
773 + else
774 + err = ext4_wb_submit_extent(wc, NULL, &nex, 1);
776 + /* we don't want to recalculate needed reservation for
777 + * each page. we may do this for each new extent */
778 + ext4_wb_release_space(inode, wc->blocks_to_release, 1);
779 + wc->blocks_to_release = 0;
781 +out:
782 + ext4_journal_stop(handle);
783 + if (err)
784 + printk("EXT4-fs: writeback error = %d\n", err);
785 + return err;
788 +static int ext4_wb_flush(struct ext4_wb_control *wc)
790 + struct list_head *cur, *tmp;
791 + struct inode *inode;
792 + int err, num = 0;
794 + if (wc->len == 0)
795 + return 0;
797 + inode = wc->mapping->host;
798 + wb_debug("start flushing %lu/%u from inode %lu\n",
799 + wc->start, wc->len, inode->i_ino);
801 + wc->pages = list_entry(wc->list.next, struct ext4_wb_pages, list);
802 + wc->extents = 0;
804 + mutex_lock(&EXT4_I(inode)->truncate_mutex);
805 + /* FIXME: last page may be partial */
806 + err = ext4_ext_walk_space(inode, wc->start, wc->len,
807 + ext4_wb_handle_extent, wc);
808 + mutex_unlock(&EXT4_I(inode)->truncate_mutex);
810 + list_for_each_safe(cur, tmp, &wc->list) {
811 + struct ext4_wb_pages *wp;
812 + wp = list_entry(cur, struct ext4_wb_pages, list);
813 + if (err) {
814 + while (wp->start < wp->num) {
815 + struct page *page = wp->pages[wp->start];
816 + BUG_ON(!PageWriteback(page));
817 + end_page_writeback(page);
818 + __set_page_dirty_nobuffers(page);
819 + wp->start++;
821 + } else {
822 + BUG_ON(num != 0);
823 + BUG_ON(wp->start != wp->num - 1 &&
824 + wp->start != wp->num);
826 + list_del(&wp->list);
827 + kfree(wp);
828 + num++;
830 + wc->pages = NULL;
831 + wc->len = 0;
832 + wc->extents = 0;
834 + return err;
837 +static int ext4_wb_add_page(struct ext4_wb_control *wc, struct page *page)
839 + struct ext4_wb_pages *wp = wc->pages;
841 + if (wp == NULL || wp->num == WB_PAGES_PER_ARRAY) {
842 + wp = kmalloc(sizeof(struct ext4_wb_pages), GFP_NOFS);
843 + if (wp == NULL) {
844 + printk("no mem for ext4_wb_pages!\n");
845 + return -ENOMEM;
847 + wp->num = 0;
848 + wp->start = 0;
849 + list_add_tail(&wp->list, &wc->list);
850 + wc->pages = wp;
853 + wp->pages[wp->num] = page;
854 + wp->num++;
856 + return 0;
859 +static inline void
860 +ext4_wb_init_control(struct ext4_wb_control *wc, struct address_space *mapping)
862 + wc->mapping = mapping;
863 + wc->len = 0;
864 + wc->blocks_to_release = 0;
865 + INIT_LIST_HEAD(&wc->list);
866 + wc->pages = NULL;
869 +static inline int
870 +ext4_wb_can_merge(struct ext4_wb_control *wc, unsigned long next)
872 + if (wc->start + wc->len == next &&
873 + wc->len <= WB_MAX_PAGES_PER_EXTENT)
874 + return 1;
875 + return 0;
878 +int ext4_wb_writepages(struct address_space *mapping,
879 + struct writeback_control *wbc)
881 + struct backing_dev_info *bdi = mapping->backing_dev_info;
882 + struct inode *inode = mapping->host;
883 + int nr_pages, i, err = 0, done = 0;
884 + struct ext4_wb_control wc;
885 + struct pagevec pvec;
886 + pgoff_t index = 0;
887 + int written = 0;
888 + int extents = 0;
889 + pgoff_t pindex = 0;
891 + wb_debug("->writepages on inode %lu (%u reserved)\n",
892 + inode->i_ino, EXT4_I(inode)->i_blocks_reserved);
893 +#ifdef EXT4_WB_SKIP_SMALL
894 + if (wbc->nr_to_write <= 64 && wbc->sync_mode == WB_SYNC_NONE)
895 + return 0;
896 +#endif
897 + atomic_inc(&EXT4_I(inode)->i_wb_writers);
898 +#ifdef EXT4_WB_STATS
899 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_reqs);
900 + atomic_add(wbc->nr_to_write, &EXT4_SB(inode->i_sb)->s_wb_nr_to_write);
901 + if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
902 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions);
903 +#endif
905 + /* skip opened-for-write small files
906 + * XXX: what do we do if most of files hit the condition? */
907 + if (wbc->sync_mode == WB_SYNC_NONE &&
908 + atomic_read(&inode->i_writecount) &&
909 + i_size_read(inode) <= 64*1024) {
910 + return 0;
913 + ext4_wb_init_control(&wc, mapping);
915 + pagevec_init(&pvec, 0);
916 + while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
917 + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
918 + for (i = 0; i < nr_pages; i++) {
919 + struct page *page = pvec.pages[i];
921 + lock_page(page);
923 + if (wbc->sync_mode != WB_SYNC_NONE)
924 + wait_on_page_writeback(page);
926 + if (page->mapping != mapping) {
927 + unlock_page(page);
928 + continue;
930 + if (PageWriteback(page)) {
931 + unlock_page(page);
932 + continue;
935 + if (wc.len && ext4_wb_can_merge(&wc, page->index) &&
936 + wbc->nr_to_write <= 0) {
937 + /*
938 + * If we already exhausted blocks we got
939 + * to write and new extent starts, stop
940 + * writeback
941 + */
942 + unlock_page(page);
943 + done = 1;
944 + break;
948 + if (!clear_page_dirty_for_io(page)) {
949 + unlock_page(page);
950 + continue;
953 + set_page_writeback(page);
954 + unlock_page(page);
956 + if (wc.len == 0) {
957 + wc.start = page->index;
958 + wc.len = 1;
959 + extents++;
960 + } else if (ext4_wb_can_merge(&wc, page->index)) {
961 + wc.len++;
962 + } else {
963 + /* end of current extent: flush it ... */
964 +#if 0
965 + if (wc.len < 64 && wc.len > 0) {
966 + printk("#%u: wow! short extent %d for flush on #%lu\n",
967 + (unsigned) current->pid, wc.len, inode->i_ino);
968 + printk("#%u: done = %d, nr_to_write %ld, sync = %d\n",
969 + (unsigned) current->pid, done, wbc->nr_to_write,
970 + wbc->sync_mode);
971 + printk("#%u: written %d, extents %d\n",
972 + (unsigned) current->pid, written, extents);
973 + printk("#%u: cur %lu, prev %lu\n",
974 + (unsigned) current->pid,
975 + (unsigned long) page->index,
976 + (unsigned long) pindex);
978 +#endif
979 + err = ext4_wb_flush(&wc);
980 + if (err) {
981 + done = 1;
982 + end_page_writeback(page);
983 + break;
986 + /* ... and start new one */
987 + BUG_ON(!PageWriteback(page));
988 + wc.start = page->index;
989 + wc.len = 1;
990 + extents++;
993 + pindex = page->index;
994 + err = ext4_wb_add_page(&wc, page);
995 + if (err) {
996 + done = 1;
997 + end_page_writeback(page);
998 + break;
1000 + written++;
1002 + wbc->nr_to_write--;
1003 +#if 0
1004 + if ((--(wbc->nr_to_write) <= 0))
1005 + done = 1;
1006 +#endif
1007 + if (wbc->nonblocking && bdi_write_congested(bdi)) {
1008 +#ifdef EXT4_WB_STATS
1009 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_congested);
1010 +#endif
1011 + wbc->encountered_congestion = 1;
1012 + done = 1;
1015 + pagevec_release(&pvec);
1017 + if (!err) {
1018 +#ifdef EXT4_WB_SKIP_SMALL
1019 + if (wc.len > 0 && wc.len < 64
1020 + && wbc->sync_mode == WB_SYNC_NONE) {
1021 + struct list_head *cur, *tmp;
1022 + list_for_each_safe(cur, tmp, &wc.list) {
1023 + struct ext4_wb_pages *wp;
1024 + wp = list_entry(cur, struct ext4_wb_pages,
1025 + list);
1026 + for (i = wp->start; i < wp->num; i++) {
1027 + struct page *page = wp->pages[i];
1028 + BUG_ON(!PageWriteback(page));
1029 + end_page_writeback(page);
1030 + __set_page_dirty_nobuffers(page);
1032 + wbc->nr_to_write += i;
1033 + list_del(&wp->list);
1034 + kfree(wp);
1036 + } else
1037 +#endif
1038 + ext4_wb_flush(&wc);
1041 + atomic_dec(&EXT4_I(inode)->i_wb_writers);
1043 +#ifdef EXT4_WB_STATS
1044 + atomic_add(written, &EXT4_SB(inode->i_sb)->s_wb_blocks);
1045 + atomic_add(extents, &EXT4_SB(inode->i_sb)->s_wb_extents);
1046 +#endif
1047 + return 0;
1050 +static void ext4_wb_clear_page(struct page *page, int from, int to)
1052 + void *kaddr;
1054 + if (to < PAGE_CACHE_SIZE || from > 0) {
1055 + kaddr = kmap_atomic(page, KM_USER0);
1056 + if (PAGE_CACHE_SIZE > to)
1057 + memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
1058 + if (0 < from)
1059 + memset(kaddr, 0, from);
1060 + flush_dcache_page(page);
1061 + kunmap_atomic(kaddr, KM_USER0);
1065 +int ext4_wb_prepare_write(struct file *file, struct page *page,
1066 + unsigned from, unsigned to)
1068 + struct inode *inode = page->mapping->host;
1069 + struct buffer_head bh, *bhw = &bh;
1070 + int err = 0;
1072 + wb_debug("prepare page %lu (%u-%u) for inode %lu\n",
1073 + page->index, from, to, page->mapping->host->i_ino);
1075 + /* if page is uptodate this means that ->prepare_write() has
1076 + * been called on page before and page is mapped to disk or
1077 + * we did reservation. page is protected and nobody can
1078 + * access it. hence, it is safe to use page->private to pass
1079 + * flag that ->commit_write() has to reserve blocks. because
1080 + * an error may occur after ->prepare_write() we should not
1081 + * reserve block here. it's better to do in ->commit_write()
1082 + * when we're sure page is to be written */
1083 + page->private = 0;
1084 + if (!PageUptodate(page)) {
1085 + /* first write to this page */
1086 + bh.b_state = 0;
1087 + err = ext4_get_block(inode, page->index, bhw, 0);
1088 + if (err)
1089 + return err;
1090 + if (!buffer_mapped(bhw)) {
1091 + /* this block isn't allocated yet, reserve space */
1092 + wb_debug("reserve space for new block\n");
1093 + page->private = 1;
1094 + ext4_wb_clear_page(page, from, to);
1095 + ClearPageMappedToDisk(page);
1096 + } else {
1097 + /* block is already mapped, so no need to reserve */
1098 + BUG_ON(PagePrivate(page));
1099 + if (to - from < PAGE_CACHE_SIZE) {
1100 + wb_debug("read block %u\n",
1101 + (unsigned) bhw->b_blocknr);
1102 + set_bh_page(bhw, page, 0);
1103 + bhw->b_this_page = NULL;
1104 + bhw->b_size = 1 << inode->i_blkbits;
1105 + atomic_set(&bhw->b_count, 1);
1106 + ll_rw_block(READ, 1, &bhw);
1107 + wait_on_buffer(bhw);
1108 + if (!buffer_uptodate(bhw))
1109 + return -EIO;
1111 + SetPageMappedToDisk(page);
1113 + } else if (!PageMappedToDisk(page) && !PagePrivate(page)) {
1114 + /* this page was a hole at time of mmap() calling
1115 + * now someone wants to modify it by sys_write() */
1116 + wb_debug("reserve block for hole\n");
1117 + page->private = 1;
1120 + return 0;
1123 +int ext4_wb_commit_write(struct file *file, struct page *page,
1124 + unsigned from, unsigned to)
1126 + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1127 + struct inode *inode = page->mapping->host;
1128 + int err = 0;
1130 + wb_debug("commit page %lu (%u-%u) for inode %lu\n",
1131 + page->index, from, to, inode->i_ino);
1133 + /* mark page private so that we get
1134 + * called to invalidate/release page */
1135 + SetPagePrivate(page);
1137 + if (!PageBooked(page) && !PageMappedToDisk(page)) {
1138 + /* ->prepare_write() observed that block for this
1139 + * page hasn't been allocated yet. therefore it
1140 + * asked to reserve block for later allocation */
1141 + BUG_ON(page->private == 0);
1142 + page->private = 0;
1143 + err = ext4_wb_reserve_space_page(page, 1);
1144 + if (err)
1145 + return err;
1148 + /* ok. block for this page is allocated already or it has
1149 + * been reserved successfully. so, user may use it */
1150 + __set_page_dirty_nobuffers(page);
1152 + SetPageUptodate(page);
1154 + /* correct in-core size, on-disk size will
1155 + * be corrected upon allocation */
1156 + if (pos > inode->i_size) {
1157 + i_size_write(inode, pos);
1158 + mark_inode_dirty(inode);
1161 + return err;
1164 +int ext4_wb_write_single_page(struct page *page,
1165 + struct writeback_control *wbc)
1167 + struct inode *inode = page->mapping->host;
1168 + struct ext4_wb_control wc;
1169 + int err;
1171 + atomic_inc(&EXT4_I(inode)->i_wb_writers);
1173 +#ifdef EXT4_WB_STATS
1174 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_single_pages);
1175 + if (atomic_read(&EXT4_I(inode)->i_wb_writers) != 1)
1176 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_collisions_sp);
1177 +#endif
1179 + ext4_wb_init_control(&wc, page->mapping);
1181 + BUG_ON(PageWriteback(page));
1182 + set_page_writeback(page);
1183 + unlock_page(page);
1185 + wc.start = page->index;
1186 + wc.len = 1;
1188 + err = ext4_wb_add_page(&wc, page);
1189 + if (err) {
1190 + printk(KERN_ERR "EXT4-fs: cant add page at %s:%d - %d\n",
1191 + __FILE__, __LINE__, err);
1192 + end_page_writeback(page);
1193 + return err;
1195 + err = ext4_wb_flush(&wc);
1196 + atomic_dec(&EXT4_I(inode)->i_wb_writers);
1198 + return err;
1201 +int ext4_wb_writepage(struct page *page, struct writeback_control *wbc)
1203 + struct inode *inode = page->mapping->host;
1204 + loff_t i_size = i_size_read(inode);
1205 + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
1206 + unsigned offset;
1207 + void *kaddr;
1209 + wb_debug("writepage %lu from inode %lu\n", page->index, inode->i_ino);
1211 + /*
1212 + * FIXME: just to play ...
1213 + * If another thread is writing inode's data and the page
1214 + * hasn't space on a disk yet, leave it for that thread
1215 + */
1216 +#if 1
1217 + if (atomic_read(&EXT4_I(page->mapping->host)->i_wb_writers)
1218 + && !PageMappedToDisk(page)) {
1219 + __set_page_dirty_nobuffers(page);
1220 + unlock_page(page);
1221 + return 0;
1223 +#endif
1225 + /* we give up here if we're reentered, because
1226 + * it might be for a different filesystem */
1227 + if (ext4_journal_current_handle()) {
1228 + __set_page_dirty_nobuffers(page);
1229 + unlock_page(page);
1230 + return 0;
1233 + /* Is the page fully inside i_size? */
1234 + if (page->index < end_index)
1235 + return ext4_wb_write_single_page(page, wbc);
1237 + /* Is the page fully outside i_size? (truncate in progress) */
1238 + offset = i_size & (PAGE_CACHE_SIZE-1);
1239 + if (page->index >= end_index + 1 || !offset) {
1240 + /*
1241 + * The page may have dirty, unmapped buffers. For example,
1242 + * they may have been added in ext4_writepage(). Make them
1243 + * freeable here, so the page does not leak.
1244 + */
1245 + ext4_wb_invalidatepage(page, 0);
1246 + unlock_page(page);
1247 + return 0; /* don't care */
1250 + /*
1251 + * The page straddles i_size. It must be zeroed out on each and every
1252 + * writepage invocation because it may be mmapped. "A file is mapped
1253 + * in multiples of the page size. For a file that is not a multiple of
1254 + * the page size, the remaining memory is zeroed when mapped, and
1255 + * writes to that region are not written out to the file."
1256 + */
1257 + kaddr = kmap_atomic(page, KM_USER0);
1258 + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
1259 + flush_dcache_page(page);
1260 + kunmap_atomic(kaddr, KM_USER0);
1261 + return ext4_wb_write_single_page(page, wbc);
1264 +int ext4_wb_releasepage(struct page *page, gfp_t wait)
1266 + wb_debug("release %sM%sR page %lu from inode %lu (wait %d)\n",
1267 + PageMappedToDisk(page) ? "" : "!",
1268 + PageBooked(page) ? "" : "!",
1269 + page->index, page->mapping->host->i_ino, wait);
1271 + if (PageWriteback(page))
1272 + return 0;
1274 + if (PagePrivate(page))
1275 + ClearPagePrivate(page);
1276 + return 0;
1279 +void ext4_wb_invalidatepage(struct page *page, unsigned long offset)
1281 + struct inode *inode = page->mapping->host;
1282 + int ret = 0;
1284 + /* ->invalidatepage() is called when page is marked Private.
1285 + * for our page being Private mean that space has been
1286 + * reserved for this page and it is being truncated. so,
1287 + * it's time to drop reservation */
1288 + wb_debug("invalidate %sM%sR page %lu from inode %lu (offset %lu)\n",
1289 + PageMappedToDisk(page) ? "" : "!",
1290 + PageBooked(page) ? "" : "!",
1291 + page->index, inode->i_ino, offset);
1293 + if (offset == 0) {
1294 + if (PageBooked(page)) {
1295 + atomic_inc(&EXT4_SB(inode->i_sb)->s_wb_dropped);
1296 + ext4_wb_release_space(inode, 1, 0);
1297 + ext4_wb_drop_page_reservation(page);
1299 + ret = try_to_release_page(page, 0);
1301 + return;
1304 +int ext4_wb_block_truncate_page(handle_t *handle, struct page *page,
1305 + struct address_space *mapping, loff_t from)
1307 + unsigned offset = from & (PAGE_CACHE_SIZE-1);
1308 + struct inode *inode = mapping->host;
1309 + struct buffer_head bh, *bhw = &bh;
1310 + unsigned blocksize, length;
1311 + void *kaddr;
1312 + int err = 0;
1314 + wb_debug("partial truncate from %lu on page %lu from inode %lu\n",
1315 + (unsigned long) from, page->index, inode->i_ino);
1317 + blocksize = inode->i_sb->s_blocksize;
1318 + length = blocksize - (offset & (blocksize - 1));
1320 + /* if page isn't uptodate we have to check has it assigned block
1321 + * if it has then that block is to be read before memset() */
1322 + if (!PageUptodate(page)) {
1323 + BUG_ON(PageMappedToDisk(page));
1324 + bh.b_state = 0;
1325 + err = ext4_get_block(inode, page->index, bhw, 0);
1326 + if (err)
1327 + goto err_out;
1328 + BUG_ON(buffer_new(bhw));
1329 + if (buffer_mapped(bhw)) {
1330 + /* time to retrieve data from a disk */
1331 + wb_debug("read block %u for part.trunc on %lu\n",
1332 + (unsigned) bhw->b_blocknr, page->index);
1333 + set_bh_page(bhw, page, 0);
1334 + bhw->b_this_page = NULL;
1335 + bhw->b_size = 1 << inode->i_blkbits;
1336 + atomic_set(&bhw->b_count, 1);
1337 + ll_rw_block(READ, 1, &bhw);
1338 + wait_on_buffer(bhw);
1339 + err = -EIO;
1340 + if (!buffer_uptodate(bhw))
1341 + goto err_out;
1342 + SetPageMappedToDisk(page);
1343 + } else {
1344 + wb_debug("zero page %lu (part.trunc)\n", page->index);
1345 + offset = 0;
1346 + length = blocksize;
1350 + kaddr = kmap_atomic(page, KM_USER0);
1351 + memset(kaddr + offset, 0, length);
1352 + flush_dcache_page(page);
1353 + kunmap_atomic(kaddr, KM_USER0);
1354 + SetPageUptodate(page);
1355 + __set_page_dirty_nobuffers(page);
1357 +err_out:
1358 + unlock_page(page);
1359 + page_cache_release(page);
1360 + return err;
1363 +void ext4_wb_init(struct super_block *sb)
1365 + if (!test_opt(sb, DELAYED_ALLOC))
1366 + return;
1368 + if (PAGE_CACHE_SHIFT != sb->s_blocksize_bits) {
1369 + printk(KERN_ERR "EXT4-fs: delayed allocation isn't"
1370 + "supported for PAGE_CACHE_SIZE != blocksize yet\n");
1371 + clear_opt (EXT4_SB(sb)->s_mount_opt, DELAYED_ALLOC);
1372 + return;
1374 + printk("EXT4-fs: delayed allocation enabled\n");
1377 +void ext4_wb_release(struct super_block *sb)
1379 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1381 + if (!test_opt(sb, DELAYED_ALLOC))
1382 + return;
1384 +#ifdef EXT4_WB_STATS
1385 + if (atomic_read(&sbi->s_wb_reqs) == 0)
1386 + return;
1388 + printk("EXT4-fs: writeback: %d blocks %d extents in %d reqs (%d ave)\n",
1389 + atomic_read(&sbi->s_wb_blocks),
1390 + atomic_read(&sbi->s_wb_extents),
1391 + atomic_read(&sbi->s_wb_reqs),
1392 + atomic_read(&sbi->s_wb_blocks) / atomic_read(&sbi->s_wb_reqs));
1393 + printk("EXT4-fs: writeback: %d nr_to_write, %d congestions, %d singles\n",
1394 + atomic_read(&sbi->s_wb_nr_to_write),
1395 + atomic_read(&sbi->s_wb_congested),
1396 + atomic_read(&sbi->s_wb_single_pages));
1397 + printk("EXT4-fs: writeback: %d collisions, %d single-page collisions\n",
1398 + atomic_read(&sbi->s_wb_collisions),
1399 + atomic_read(&sbi->s_wb_collisions_sp));
1400 + printk("EXT4-fs: writeback: %d allocated, %d dropped\n",
1401 + atomic_read(&sbi->s_wb_allocated),
1402 + atomic_read(&sbi->s_wb_dropped));
1403 +#endif
1406 Index: linux-2.6.23-rc2/include/linux/ext4_fs.h
1407 ===================================================================
1408 --- linux-2.6.23-rc2.orig/include/linux/ext4_fs.h 2007-08-06 22:18:59.000000000 -0700
1409 +++ linux-2.6.23-rc2/include/linux/ext4_fs.h 2007-08-06 22:19:05.000000000 -0700
1410 @@ -488,6 +488,8 @@ do { \
1411 #define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
1412 #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
1413 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
1414 +#define EXT4_MOUNT_DELAYED_ALLOC 0x2000000/* Delayed allocation support*/
1416 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
1417 #ifndef _LINUX_EXT2_FS_H
1418 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
1419 @@ -1101,6 +1103,21 @@ ext4_get_blocks_wrap(handle_t *handle, s
1423 +/* writeback.c */
1424 +extern int ext4_wb_writepages(struct address_space *,
1425 + struct writeback_control *);
1426 +extern int ext4_wb_prepare_write(struct file *file, struct page *page,
1427 + unsigned from, unsigned to);
1428 +extern int ext4_wb_commit_write(struct file *, struct page *, unsigned,
1429 + unsigned);
1430 +extern int ext4_wb_writepage(struct page *, struct writeback_control *);
1431 +extern void ext4_wb_invalidatepage(struct page *, unsigned long);
1432 +extern int ext4_wb_releasepage(struct page *, gfp_t);
1433 +extern int ext4_wb_block_truncate_page(handle_t *, struct page *,
1434 + struct address_space *, loff_t);
1435 +extern void ext4_wb_init(struct super_block *);
1436 +extern void ext4_wb_release(struct super_block *);
1438 #endif /* __KERNEL__ */
1440 #endif /* _LINUX_EXT4_FS_H */
1441 Index: linux-2.6.23-rc2/include/linux/ext4_fs_extents.h
1442 ===================================================================
1443 --- linux-2.6.23-rc2.orig/include/linux/ext4_fs_extents.h 2007-08-06 22:18:09.000000000 -0700
1444 +++ linux-2.6.23-rc2/include/linux/ext4_fs_extents.h 2007-08-06 22:19:05.000000000 -0700
1445 @@ -235,6 +235,7 @@ extern unsigned int ext4_ext_check_overl
1446 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
1447 extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
1448 extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);
1449 +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
1451 #endif /* _LINUX_EXT4_EXTENTS */
1453 Index: linux-2.6.23-rc2/include/linux/ext4_fs_i.h
1454 ===================================================================
1455 --- linux-2.6.23-rc2.orig/include/linux/ext4_fs_i.h 2007-08-06 22:18:09.000000000 -0700
1456 +++ linux-2.6.23-rc2/include/linux/ext4_fs_i.h 2007-08-06 22:19:05.000000000 -0700
1457 @@ -158,6 +158,10 @@ struct ext4_inode_info {
1458 * struct timespec i_{a,c,m}time in the generic inode.
1460 struct timespec i_crtime;
1461 + __u32 i_blocks_reserved;
1462 + __u32 i_md_reserved;
1463 + spinlock_t i_wb_reserved_lock; /* to protect i_md_reserved */
1464 + atomic_t i_wb_writers;
1467 #endif /* _LINUX_EXT4_FS_I */
1468 Index: linux-2.6.23-rc2/include/linux/ext4_fs_sb.h
1469 ===================================================================
1470 --- linux-2.6.23-rc2.orig/include/linux/ext4_fs_sb.h 2007-08-06 22:18:59.000000000 -0700
1471 +++ linux-2.6.23-rc2/include/linux/ext4_fs_sb.h 2007-08-06 22:19:05.000000000 -0700
1472 @@ -97,6 +97,17 @@ struct ext4_sb_info {
1473 unsigned long s_ext_blocks;
1474 unsigned long s_ext_extents;
1475 #endif
1477 + atomic_t s_wb_congested;
1478 + atomic_t s_wb_single_pages;
1479 + atomic_t s_wb_collisions_sp;
1480 + atomic_t s_wb_allocated;
1481 + atomic_t s_wb_reqs;
1482 + atomic_t s_wb_nr_to_write;
1483 + atomic_t s_wb_collisions;
1484 + atomic_t s_wb_blocks;
1485 + atomic_t s_wb_extents;
1486 + atomic_t s_wb_dropped;
1489 #endif /* _LINUX_EXT4_FS_SB */