fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5  */
   6
   7 /*
   8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9  *
  10  * Removed a lot of unnecessary code and simplified things now that
  11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12  *
  13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15  *
  16  * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
  17  *
  18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19  */
  20
  21 #include <linux/kernel.h>
  22 #include <linux/syscalls.h>
  23 #include <linux/fs.h>
  24 #include <linux/mm.h>
  25 #include <linux/percpu.h>
  26 #include <linux/slab.h>
  27 #include <linux/smp_lock.h>
  28 #include <linux/capability.h>
  29 #include <linux/blkdev.h>
  30 #include <linux/file.h>
  31 #include <linux/quotaops.h>
  32 #include <linux/highmem.h>
  33 #include <linux/module.h>
  34 #include <linux/writeback.h>
  35 #include <linux/hash.h>
  36 #include <linux/suspend.h>
  37 #include <linux/buffer_head.h>
  38 #include <linux/bio.h>
  39 #include <linux/notifier.h>
  40 #include <linux/cpu.h>
  41 #include <linux/bitops.h>
  42 #include <linux/mpage.h>
  43 #include <linux/bit_spinlock.h>
  44
  45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  46 static void invalidate_bh_lrus(void);
  47
  48 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  49
  50 inline void
  51 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  52 {
  53         bh->b_end_io = handler;
  54         bh->b_private = private;
  55 }
  56
  57 static int sync_buffer(void *word)
  58 {
  59         struct block_device *bd;
  60         struct buffer_head *bh
  61                 = container_of(word, struct buffer_head, b_state);
  62
  63         smp_mb();
  64         bd = bh->b_bdev;
  65         if (bd)
  66                 blk_run_address_space(bd->bd_inode->i_mapping);
  67         io_schedule();
  68         return 0;
  69 }
  70
  71 void fastcall __lock_buffer(struct buffer_head *bh)
  72 {
  73         wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
  74                                                         TASK_UNINTERRUPTIBLE);
  75 }
  76 EXPORT_SYMBOL(__lock_buffer);
  77
  78 void fastcall unlock_buffer(struct buffer_head *bh)
  79 {
  80         clear_buffer_locked(bh);
  81         smp_mb__after_clear_bit();
  82         wake_up_bit(&bh->b_state, BH_Lock);
  83 }
  84
  85 /*
  86  * Block until a buffer comes unlocked.  This doesn't stop it
  87  * from becoming locked again - you have to lock it yourself
  88  * if you want to preserve its state.
  89  */
  90 void __wait_on_buffer(struct buffer_head * bh)
  91 {
  92         wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
  93 }
  94
  95 static void
  96 __clear_page_buffers(struct page *page)
  97 {
  98         ClearPagePrivate(page);
  99         set_page_private(page, 0);
 100         page_cache_release(page);
 101 }
 102
 103 static void buffer_io_error(struct buffer_head *bh)
 104 {
 105         char b[BDEVNAME_SIZE];
 106
 107         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 108                         bdevname(bh->b_bdev, b),
 109                         (unsigned long long)bh->b_blocknr);
 110 }
 111
 112 /*
 113  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 114  * unlock the buffer. This is what ll_rw_block uses too.
 115  */
 116 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 117 {
 118         if (uptodate) {
 119                 set_buffer_uptodate(bh);
 120         } else {
 121                 /* This happens, due to failed READA attempts. */
 122                 clear_buffer_uptodate(bh);
 123         }
 124         unlock_buffer(bh);
 125         put_bh(bh);
 126 }
 127
 128 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 129 {
 130         char b[BDEVNAME_SIZE];
 131
 132         if (uptodate) {
 133                 set_buffer_uptodate(bh);
 134         } else {
 135                 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 136                         buffer_io_error(bh);
 137                         printk(KERN_WARNING "lost page write due to "
 138                                         "I/O error on %s\n",
 139                                        bdevname(bh->b_bdev, b));
 140                 }
 141                 set_buffer_write_io_error(bh);
 142                 clear_buffer_uptodate(bh);
 143         }
 144         unlock_buffer(bh);
 145         put_bh(bh);
 146 }
 147
 148 /*
 149  * Write out and wait upon all the dirty data associated with a block
 150  * device via its mapping.  Does not take the superblock lock.
 151  */
 152 int sync_blockdev(struct block_device *bdev)
 153 {
 154         int ret = 0;
 155
 156         if (bdev)
 157                 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
 158         return ret;
 159 }
 160 EXPORT_SYMBOL(sync_blockdev);
 161
 162 /*
 163  * Write out and wait upon all dirty data associated with this
 164  * device.   Filesystem data as well as the underlying block
 165  * device.  Takes the superblock lock.
 166  */
 167 int fsync_bdev(struct block_device *bdev)
 168 {
 169         struct super_block *sb = get_super(bdev);
 170         if (sb) {
 171                 int res = fsync_super(sb);
 172                 drop_super(sb);
 173                 return res;
 174         }
 175         return sync_blockdev(bdev);
 176 }
 177
 178 /**
 179  * freeze_bdev  --  lock a filesystem and force it into a consistent state
 180  * @bdev:       blockdevice to lock
 181  *
 182  * This takes the block device bd_mount_mutex to make sure no new mounts
 183  * happen on bdev until thaw_bdev() is called.
 184  * If a superblock is found on this device, we take the s_umount semaphore
 185  * on it to make sure nobody unmounts until the snapshot creation is done.
 186  */
 187 struct super_block *freeze_bdev(struct block_device *bdev)
 188 {
 189         struct super_block *sb;
 190
 191         mutex_lock(&bdev->bd_mount_mutex);
 192         sb = get_super(bdev);
 193         if (sb && !(sb->s_flags & MS_RDONLY)) {
 194                 sb->s_frozen = SB_FREEZE_WRITE;
 195                 smp_wmb();
 196
 197                 __fsync_super(sb);
 198
 199                 sb->s_frozen = SB_FREEZE_TRANS;
 200                 smp_wmb();
 201
 202                 sync_blockdev(sb->s_bdev);
 203
 204                 if (sb->s_op->write_super_lockfs)
 205                         sb->s_op->write_super_lockfs(sb);
 206         }
 207
 208         sync_blockdev(bdev);
 209         return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
 210 }
 211 EXPORT_SYMBOL(freeze_bdev);
 212
 213 /**
 214  * thaw_bdev  -- unlock filesystem
 215  * @bdev:       blockdevice to unlock
 216  * @sb:         associated superblock
 217  *
 218  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 219  */
 220 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
 221 {
 222         if (sb) {
 223                 BUG_ON(sb->s_bdev != bdev);
 224
 225                 if (sb->s_op->unlockfs)
 226                         sb->s_op->unlockfs(sb);
 227                 sb->s_frozen = SB_UNFROZEN;
 228                 smp_wmb();
 229                 wake_up(&sb->s_wait_unfrozen);
 230                 drop_super(sb);
 231         }
 232
 233         mutex_unlock(&bdev->bd_mount_mutex);
 234 }
 235 EXPORT_SYMBOL(thaw_bdev);
 236
 237 /*
 238  * Various filesystems appear to want __find_get_block to be non-blocking.
 239  * But it's the page lock which protects the buffers.  To get around this,
 240  * we get exclusion from try_to_free_buffers with the blockdev mapping's
 241  * private_lock.
 242  *
 243  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 244  * may be quite high.  This code could TryLock the page, and if that
 245  * succeeds, there is no need to take private_lock. (But if
 246  * private_lock is contended then so is mapping->tree_lock).
 247  */
 248 static struct buffer_head *
 249 __find_get_block_slow(struct block_device *bdev, sector_t block)
 250 {
 251         struct inode *bd_inode = bdev->bd_inode;
 252         struct address_space *bd_mapping = bd_inode->i_mapping;
 253         struct buffer_head *ret = NULL;
 254         pgoff_t index;
 255         struct buffer_head *bh;
 256         struct buffer_head *head;
 257         struct page *page;
 258         int all_mapped = 1;
 259
 260         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 261         page = find_get_page(bd_mapping, index);
 262         if (!page)
 263                 goto out;
 264
 265         spin_lock(&bd_mapping->private_lock);
 266         if (!page_has_buffers(page))
 267                 goto out_unlock;
 268         head = page_buffers(page);
 269         bh = head;
 270         do {
 271                 if (bh->b_blocknr == block) {
 272                         ret = bh;
 273                         get_bh(bh);
 274                         goto out_unlock;
 275                 }
 276                 if (!buffer_mapped(bh))
 277                         all_mapped = 0;
 278                 bh = bh->b_this_page;
 279         } while (bh != head);
 280
 281         /* we might be here because some of the buffers on this page are
 282          * not mapped.  This is due to various races between
 283          * file io on the block device and getblk.  It gets dealt with
 284          * elsewhere, don't buffer_error if we had some unmapped buffers
 285          */
 286         if (all_mapped) {
 287                 printk("__find_get_block_slow() failed. "
 288                         "block=%llu, b_blocknr=%llu\n",
 289                         (unsigned long long)block,
 290                         (unsigned long long)bh->b_blocknr);
 291 #if 0   // mask by Victor Yu. 02-12-2007
 292                 printk("b_state=0x%08lx, b_size=%zu\n",
 293                         bh->b_state, bh->b_size);
 294 #else
 295                 printk("b_state=0x%08lx, b_size=%u\n",
 296                         bh->b_state, bh->b_size);
 297 #endif
 298                 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
 299         }
 300 out_unlock:
 301         spin_unlock(&bd_mapping->private_lock);
 302         page_cache_release(page);
 303 out:
 304         return ret;
 305 }
 306
 307 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 308    of fs corruption is going on. Trashing dirty data always imply losing
 309    information that was supposed to be just stored on the physical layer
 310    by the user.
 311
 312    Thus invalidate_buffers in general usage is not allwowed to trash
 313    dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
 314    be preserved.  These buffers are simply skipped.
 315
 316    We also skip buffers which are still in use.  For example this can
 317    happen if a userspace program is reading the block device.
 318
 319    NOTE: In the case where the user removed a removable-media-disk even if
 320    there's still dirty data not synced on disk (due a bug in the device driver
 321    or due an error of the user), by not destroying the dirty buffers we could
 322    generate corruption also on the next media inserted, thus a parameter is
 323    necessary to handle this case in the most safe way possible (trying
 324    to not corrupt also the new disk inserted with the data belonging to
 325    the old now corrupted disk). Also for the ramdisk the natural thing
 326    to do in order to release the ramdisk memory is to destroy dirty buffers.
 327
 328    These are two special cases. Normal usage imply the device driver
 329    to issue a sync on the device (without waiting I/O completion) and
 330    then an invalidate_buffers call that doesn't trash dirty buffers.
 331
 332    For handling cache coherency with the blkdev pagecache the 'update' case
 333    is been introduced. It is needed to re-read from disk any pinned
 334    buffer. NOTE: re-reading from disk is destructive so we can do it only
 335    when we assume nobody is changing the buffercache under our I/O and when
 336    we think the disk contains more recent information than the buffercache.
 337    The update == 1 pass marks the buffers we need to update, the update == 2
 338    pass does the actual I/O. */
 339 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 340 {
 341         struct address_space *mapping = bdev->bd_inode->i_mapping;
 342
 343         if (mapping->nrpages == 0)
 344                 return;
 345
 346         invalidate_bh_lrus();
 347         /*
 348          * FIXME: what about destroy_dirty_buffers?
 349          * We really want to use invalidate_inode_pages2() for
 350          * that, but not until that's cleaned up.
 351          */
 352         invalidate_inode_pages(mapping);
 353 }
 354
 355 /*
 356  * Kick pdflush then try to free up some ZONE_NORMAL memory.
 357  */
 358 static void free_more_memory(void)
 359 {
 360         struct zone **zones;
 361         pg_data_t *pgdat;
 362
 363         wakeup_pdflush(1024);
 364         yield();
 365
 366         for_each_online_pgdat(pgdat) {
 367                 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
 368                 if (*zones)
 369                         try_to_free_pages(zones, GFP_NOFS);
 370         }
 371 }
 372
 373 /*
 374  * I/O completion handler for block_read_full_page() - pages
 375  * which come unlocked at the end of I/O.
 376  */
 377 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 378 {
 379         unsigned long flags;
 380         struct buffer_head *first;
 381         struct buffer_head *tmp;
 382         struct page *page;
 383         int page_uptodate = 1;
 384
 385         BUG_ON(!buffer_async_read(bh));
 386
 387         page = bh->b_page;
 388         if (uptodate) {
 389                 set_buffer_uptodate(bh);
 390         } else {
 391                 clear_buffer_uptodate(bh);
 392                 if (printk_ratelimit())
 393                         buffer_io_error(bh);
 394                 SetPageError(page);
 395         }
 396
 397         /*
 398          * Be _very_ careful from here on. Bad things can happen if
 399          * two buffer heads end IO at almost the same time and both
 400          * decide that the page is now completely done.
 401          */
 402         first = page_buffers(page);
 403         local_irq_save(flags);
 404         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 405         clear_buffer_async_read(bh);
 406         unlock_buffer(bh);
 407         tmp = bh;
 408         do {
 409                 if (!buffer_uptodate(tmp))
 410                         page_uptodate = 0;
 411                 if (buffer_async_read(tmp)) {
 412                         BUG_ON(!buffer_locked(tmp));
 413                         goto still_busy;
 414                 }
 415                 tmp = tmp->b_this_page;
 416         } while (tmp != bh);
 417         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 418         local_irq_restore(flags);
 419
 420         /*
 421          * If none of the buffers had errors and they are all
 422          * uptodate then we can set the page uptodate.
 423          */
 424         if (page_uptodate && !PageError(page))
 425                 SetPageUptodate(page);
 426         unlock_page(page);
 427         return;
 428
 429 still_busy:
 430         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 431         local_irq_restore(flags);
 432         return;
 433 }
 434
 435 /*
 436  * Completion handler for block_write_full_page() - pages which are unlocked
 437  * during I/O, and which have PageWriteback cleared upon I/O completion.
 438  */
 439 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 440 {
 441         char b[BDEVNAME_SIZE];
 442         unsigned long flags;
 443         struct buffer_head *first;
 444         struct buffer_head *tmp;
 445         struct page *page;
 446
 447         BUG_ON(!buffer_async_write(bh));
 448
 449         page = bh->b_page;
 450         if (uptodate) {
 451                 set_buffer_uptodate(bh);
 452         } else {
 453                 if (printk_ratelimit()) {
 454                         buffer_io_error(bh);
 455                         printk(KERN_WARNING "lost page write due to "
 456                                         "I/O error on %s\n",
 457                                bdevname(bh->b_bdev, b));
 458                 }
 459 #if 0   // mask by Victor Yu. 02-12-2007
 460                 set_bit(AS_EIO, &page->mapping->flags);
 461 #else
 462                 set_bit(AS_EIO, &page->u.xx.mapping->flags);
 463 #endif
 464                 set_buffer_write_io_error(bh);
 465                 clear_buffer_uptodate(bh);
 466                 SetPageError(page);
 467         }
 468
 469         first = page_buffers(page);
 470         local_irq_save(flags);
 471         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 472
 473         clear_buffer_async_write(bh);
 474         unlock_buffer(bh);
 475         tmp = bh->b_this_page;
 476         while (tmp != bh) {
 477                 if (buffer_async_write(tmp)) {
 478                         BUG_ON(!buffer_locked(tmp));
 479                         goto still_busy;
 480                 }
 481                 tmp = tmp->b_this_page;
 482         }
 483         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 484         local_irq_restore(flags);
 485         end_page_writeback(page);
 486         return;
 487
 488 still_busy:
 489         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 490         local_irq_restore(flags);
 491         return;
 492 }
 493
 494 /*
 495  * If a page's buffers are under async readin (end_buffer_async_read
 496  * completion) then there is a possibility that another thread of
 497  * control could lock one of the buffers after it has completed
 498  * but while some of the other buffers have not completed.  This
 499  * locked buffer would confuse end_buffer_async_read() into not unlocking
 500  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 501  * that this buffer is not under async I/O.
 502  *
 503  * The page comes unlocked when it has no locked buffer_async buffers
 504  * left.
 505  *
 506  * PageLocked prevents anyone starting new async I/O reads any of
 507  * the buffers.
 508  *
 509  * PageWriteback is used to prevent simultaneous writeout of the same
 510  * page.
 511  *
 512  * PageLocked prevents anyone from starting writeback of a page which is
 513  * under read I/O (PageWriteback is only ever set against a locked page).
 514  */
 515 static void mark_buffer_async_read(struct buffer_head *bh)
 516 {
 517         bh->b_end_io = end_buffer_async_read;
 518         set_buffer_async_read(bh);
 519 }
 520
 521 void mark_buffer_async_write(struct buffer_head *bh)
 522 {
 523         bh->b_end_io = end_buffer_async_write;
 524         set_buffer_async_write(bh);
 525 }
 526 EXPORT_SYMBOL(mark_buffer_async_write);
 527
 528
 529 /*
 530  * fs/buffer.c contains helper functions for buffer-backed address space's
 531  * fsync functions.  A common requirement for buffer-based filesystems is
 532  * that certain data from the backing blockdev needs to be written out for
 533  * a successful fsync().  For example, ext2 indirect blocks need to be
 534  * written back and waited upon before fsync() returns.
 535  *
 536  * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
 537  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 538  * management of a list of dependent buffers at ->i_mapping->private_list.
 539  *
 540  * Locking is a little subtle: try_to_free_buffers() will remove buffers
 541  * from their controlling inode's queue when they are being freed.  But
 542  * try_to_free_buffers() will be operating against the *blockdev* mapping
 543  * at the time, not against the S_ISREG file which depends on those buffers.
 544  * So the locking for private_list is via the private_lock in the address_space
 545  * which backs the buffers.  Which is different from the address_space
 546  * against which the buffers are listed.  So for a particular address_space,
 547  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 548  * mapping->private_list will always be protected by the backing blockdev's
 549  * ->private_lock.
 550  *
 551  * Which introduces a requirement: all buffers on an address_space's
 552  * ->private_list must be from the same address_space: the blockdev's.
 553  *
 554  * address_spaces which do not place buffers at ->private_list via these
 555  * utility functions are free to use private_lock and private_list for
 556  * whatever they want.  The only requirement is that list_empty(private_list)
 557  * be true at clear_inode() time.
 558  *
 559  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 560  * filesystems should do that.  invalidate_inode_buffers() should just go
 561  * BUG_ON(!list_empty).
 562  *
 563  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 564  * take an address_space, not an inode.  And it should be called
 565  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 566  * queued up.
 567  *
 568  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 569  * list if it is already on a list.  Because if the buffer is on a list,
 570  * it *must* already be on the right one.  If not, the filesystem is being
 571  * silly.  This will save a ton of locking.  But first we have to ensure
 572  * that buffers are taken *off* the old inode's list when they are freed
 573  * (presumably in truncate).  That requires careful auditing of all
 574  * filesystems (do it inside bforget()).  It could also be done by bringing
 575  * b_inode back.
 576  */
 577
 578 /*
 579  * The buffer's backing address_space's private_lock must be held
 580  */
 581 static inline void __remove_assoc_queue(struct buffer_head *bh)
 582 {
 583         list_del_init(&bh->b_assoc_buffers);
 584         WARN_ON(!bh->b_assoc_map);
 585         if (buffer_write_io_error(bh))
 586                 set_bit(AS_EIO, &bh->b_assoc_map->flags);
 587         bh->b_assoc_map = NULL;
 588 }
 589
 590 int inode_has_buffers(struct inode *inode)
 591 {
 592         return !list_empty(&inode->i_data.private_list);
 593 }
 594
 595 /*
 596  * osync is designed to support O_SYNC io.  It waits synchronously for
 597  * all already-submitted IO to complete, but does not queue any new
 598  * writes to the disk.
 599  *
 600  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 601  * you dirty the buffers, and then use osync_inode_buffers to wait for
 602  * completion.  Any other dirty buffers which are not yet queued for
 603  * write will not be flushed to disk by the osync.
 604  */
 605 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 606 {
 607         struct buffer_head *bh;
 608         struct list_head *p;
 609         int err = 0;
 610
 611         spin_lock(lock);
 612 repeat:
 613         list_for_each_prev(p, list) {
 614                 bh = BH_ENTRY(p);
 615                 if (buffer_locked(bh)) {
 616                         get_bh(bh);
 617                         spin_unlock(lock);
 618                         wait_on_buffer(bh);
 619                         if (!buffer_uptodate(bh))
 620                                 err = -EIO;
 621                         brelse(bh);
 622                         spin_lock(lock);
 623                         goto repeat;
 624                 }
 625         }
 626         spin_unlock(lock);
 627         return err;
 628 }
 629
 630 /**
 631  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 632  *                        buffers
 633  * @mapping: the mapping which wants those buffers written
 634  *
 635  * Starts I/O against the buffers at mapping->private_list, and waits upon
 636  * that I/O.
 637  *
 638  * Basically, this is a convenience function for fsync().
 639  * @mapping is a file or directory which needs those buffers to be written for
 640  * a successful fsync().
 641  */
 642 int sync_mapping_buffers(struct address_space *mapping)
 643 {
 644         struct address_space *buffer_mapping = mapping->assoc_mapping;
 645
 646         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 647                 return 0;
 648
 649         return fsync_buffers_list(&buffer_mapping->private_lock,
 650                                         &mapping->private_list);
 651 }
 652 EXPORT_SYMBOL(sync_mapping_buffers);
 653
 654 /*
 655  * Called when we've recently written block `bblock', and it is known that
 656  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 657  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 658  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 659  */
 660 void write_boundary_block(struct block_device *bdev,
 661                         sector_t bblock, unsigned blocksize)
 662 {
 663         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 664         if (bh) {
 665                 if (buffer_dirty(bh))
 666                         ll_rw_block(WRITE, 1, &bh);
 667                 put_bh(bh);
 668         }
 669 }
 670
 671 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 672 {
 673         struct address_space *mapping = inode->i_mapping;
 674 #if 0   // mask by Victor Yu. 02-12-2007
 675         struct address_space *buffer_mapping = bh->b_page->mapping;
 676 #else
 677         struct address_space *buffer_mapping = bh->b_page->u.xx.mapping;
 678 #endif
 679
 680         mark_buffer_dirty(bh);
 681         if (!mapping->assoc_mapping) {
 682                 mapping->assoc_mapping = buffer_mapping;
 683         } else {
 684                 BUG_ON(mapping->assoc_mapping != buffer_mapping);
 685         }
 686         if (list_empty(&bh->b_assoc_buffers)) {
 687                 spin_lock(&buffer_mapping->private_lock);
 688                 list_move_tail(&bh->b_assoc_buffers,
 689                                 &mapping->private_list);
 690                 bh->b_assoc_map = mapping;
 691                 spin_unlock(&buffer_mapping->private_lock);
 692         }
 693 }
 694 EXPORT_SYMBOL(mark_buffer_dirty_inode);
 695
 696 /*
 697  * Add a page to the dirty page list.
 698  *
 699  * It is a sad fact of life that this function is called from several places
 700  * deeply under spinlocking.  It may not sleep.
 701  *
 702  * If the page has buffers, the uptodate buffers are set dirty, to preserve
 703  * dirty-state coherency between the page and the buffers.  It the page does
 704  * not have buffers then when they are later attached they will all be set
 705  * dirty.
 706  *
 707  * The buffers are dirtied before the page is dirtied.  There's a small race
 708  * window in which a writepage caller may see the page cleanness but not the
 709  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 710  * before the buffers, a concurrent writepage caller could clear the page dirty
 711  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 712  * page on the dirty page list.
 713  *
 714  * We use private_lock to lock against try_to_free_buffers while using the
 715  * page's buffer list.  Also use this to protect against clean buffers being
 716  * added to the page after it was set dirty.
 717  *
 718  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 719  * address_space though.
 720  */
 721 int __set_page_dirty_buffers(struct page *page)
 722 {
 723         struct address_space * const mapping = page_mapping(page);
 724
 725         if (unlikely(!mapping))
 726                 return !TestSetPageDirty(page);
 727
 728         spin_lock(&mapping->private_lock);
 729         if (page_has_buffers(page)) {
 730                 struct buffer_head *head = page_buffers(page);
 731                 struct buffer_head *bh = head;
 732
 733                 do {
 734                         set_buffer_dirty(bh);
 735                         bh = bh->b_this_page;
 736                 } while (bh != head);
 737         }
 738         spin_unlock(&mapping->private_lock);
 739
 740         if (!TestSetPageDirty(page)) {
 741                 write_lock_irq(&mapping->tree_lock);
 742 #if 0   // mask by Victor Yu. 02-12-2007
 743                 if (page->mapping) {    /* Race with truncate? */
 744 #else
 745                 if (page->u.xx.mapping) {       /* Race with truncate? */
 746 #endif
 747                         if (mapping_cap_account_dirty(mapping))
 748                                 __inc_zone_page_state(page, NR_FILE_DIRTY);
 749                         radix_tree_tag_set(&mapping->page_tree,
 750                                                 page_index(page),
 751                                                 PAGECACHE_TAG_DIRTY);
 752                 }
 753                 write_unlock_irq(&mapping->tree_lock);
 754                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 755                 return 1;
 756         }
 757         return 0;
 758 }
 759 EXPORT_SYMBOL(__set_page_dirty_buffers);
 760
 761 /*
 762  * Write out and wait upon a list of buffers.
 763  *
 764  * We have conflicting pressures: we want to make sure that all
 765  * initially dirty buffers get waited on, but that any subsequently
 766  * dirtied buffers don't.  After all, we don't want fsync to last
 767  * forever if somebody is actively writing to the file.
 768  *
 769  * Do this in two main stages: first we copy dirty buffers to a
 770  * temporary inode list, queueing the writes as we go.  Then we clean
 771  * up, waiting for those writes to complete.
 772  *
 773  * During this second stage, any subsequent updates to the file may end
 774  * up refiling the buffer on the original inode's dirty list again, so
 775  * there is a chance we will end up with a buffer queued for write but
 776  * not yet completed on that list.  So, as a final cleanup we go through
 777  * the osync code to catch these locked, dirty buffers without requeuing
 778  * any newly dirty buffers for write.
 779  */
 780 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 781 {
 782         struct buffer_head *bh;
 783         struct list_head tmp;
 784         int err = 0, err2;
 785
 786         INIT_LIST_HEAD(&tmp);
 787
 788         spin_lock(lock);
 789         while (!list_empty(list)) {
 790                 bh = BH_ENTRY(list->next);
 791                 __remove_assoc_queue(bh);
 792                 if (buffer_dirty(bh) || buffer_locked(bh)) {
 793                         list_add(&bh->b_assoc_buffers, &tmp);
 794                         if (buffer_dirty(bh)) {
 795                                 get_bh(bh);
 796                                 spin_unlock(lock);
 797                                 /*
 798                                  * Ensure any pending I/O completes so that
 799                                  * ll_rw_block() actually writes the current
 800                                  * contents - it is a noop if I/O is still in
 801                                  * flight on potentially older contents.
 802                                  */
 803                                 ll_rw_block(SWRITE, 1, &bh);
 804                                 brelse(bh);
 805                                 spin_lock(lock);
 806                         }
 807                 }
 808         }
 809
 810         while (!list_empty(&tmp)) {
 811                 bh = BH_ENTRY(tmp.prev);
 812                 list_del_init(&bh->b_assoc_buffers);
 813                 get_bh(bh);
 814                 spin_unlock(lock);
 815                 wait_on_buffer(bh);
 816                 if (!buffer_uptodate(bh))
 817                         err = -EIO;
 818                 brelse(bh);
 819                 spin_lock(lock);
 820         }
 821
 822         spin_unlock(lock);
 823         err2 = osync_buffers_list(lock, list);
 824         if (err)
 825                 return err;
 826         else
 827                 return err2;
 828 }
 829
 830 /*
 831  * Invalidate any and all dirty buffers on a given inode.  We are
 832  * probably unmounting the fs, but that doesn't mean we have already
 833  * done a sync().  Just drop the buffers from the inode list.
 834  *
 835  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 836  * assumes that all the buffers are against the blockdev.  Not true
 837  * for reiserfs.
 838  */
 839 void invalidate_inode_buffers(struct inode *inode)
 840 {
 841         if (inode_has_buffers(inode)) {
 842                 struct address_space *mapping = &inode->i_data;
 843                 struct list_head *list = &mapping->private_list;
 844                 struct address_space *buffer_mapping = mapping->assoc_mapping;
 845
 846                 spin_lock(&buffer_mapping->private_lock);
 847                 while (!list_empty(list))
 848                         __remove_assoc_queue(BH_ENTRY(list->next));
 849                 spin_unlock(&buffer_mapping->private_lock);
 850         }
 851 }
 852
 853 /*
 854  * Remove any clean buffers from the inode's buffer list.  This is called
 855  * when we're trying to free the inode itself.  Those buffers can pin it.
 856  *
 857  * Returns true if all buffers were removed.
 858  */
 859 int remove_inode_buffers(struct inode *inode)
 860 {
 861         int ret = 1;
 862
 863         if (inode_has_buffers(inode)) {
 864                 struct address_space *mapping = &inode->i_data;
 865                 struct list_head *list = &mapping->private_list;
 866                 struct address_space *buffer_mapping = mapping->assoc_mapping;
 867
 868                 spin_lock(&buffer_mapping->private_lock);
 869                 while (!list_empty(list)) {
 870                         struct buffer_head *bh = BH_ENTRY(list->next);
 871                         if (buffer_dirty(bh)) {
 872                                 ret = 0;
 873                                 break;
 874                         }
 875                         __remove_assoc_queue(bh);
 876                 }
 877                 spin_unlock(&buffer_mapping->private_lock);
 878         }
 879         return ret;
 880 }
 881
 882 /*
 883  * Create the appropriate buffers when given a page for data area and
 884  * the size of each buffer.. Use the bh->b_this_page linked list to
 885  * follow the buffers created.  Return NULL if unable to create more
 886  * buffers.
 887  *
 888  * The retry flag is used to differentiate async IO (paging, swapping)
 889  * which may not fail from ordinary buffer allocations.
 890  */
 891 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 892                 int retry)
 893 {
 894         struct buffer_head *bh, *head;
 895         long offset;
 896
 897 try_again:
 898         head = NULL;
 899         offset = PAGE_SIZE;
 900         while ((offset -= size) >= 0) {
 901                 bh = alloc_buffer_head(GFP_NOFS);
 902                 if (!bh)
 903                         goto no_grow;
 904
 905                 bh->b_bdev = NULL;
 906                 bh->b_this_page = head;
 907                 bh->b_blocknr = -1;
 908                 head = bh;
 909
 910                 bh->b_state = 0;
 911                 atomic_set(&bh->b_count, 0);
 912                 bh->b_private = NULL;
 913                 bh->b_size = size;
 914
 915                 /* Link the buffer to its page */
 916                 set_bh_page(bh, page, offset);
 917
 918                 init_buffer(bh, NULL, NULL);
 919         }
 920         return head;
 921 /*
 922  * In case anything failed, we just free everything we got.
 923  */
 924 no_grow:
 925         if (head) {
 926                 do {
 927                         bh = head;
 928                         head = head->b_this_page;
 929                         free_buffer_head(bh);
 930                 } while (head);
 931         }
 932
 933         /*
 934          * Return failure for non-async IO requests.  Async IO requests
 935          * are not allowed to fail, so we have to wait until buffer heads
 936          * become available.  But we don't want tasks sleeping with
 937          * partially complete buffers, so all were released above.
 938          */
 939         if (!retry)
 940                 return NULL;
 941
 942         /* We're _really_ low on memory. Now we just
 943          * wait for old buffer heads to become free due to
 944          * finishing IO.  Since this is an async request and
 945          * the reserve list is empty, we're sure there are
 946          * async buffer heads in use.
 947          */
 948         free_more_memory();
 949         goto try_again;
 950 }
 951 EXPORT_SYMBOL_GPL(alloc_page_buffers);
 952
 953 static inline void
 954 link_dev_buffers(struct page *page, struct buffer_head *head)
 955 {
 956         struct buffer_head *bh, *tail;
 957
 958         bh = head;
 959         do {
 960                 tail = bh;
 961                 bh = bh->b_this_page;
 962         } while (bh);
 963         tail->b_this_page = head;
 964         attach_page_buffers(page, head);
 965 }
 966
 967 /*
 968  * Initialise the state of a blockdev page's buffers.
 969  */
 970 static void
 971 init_page_buffers(struct page *page, struct block_device *bdev,
 972                         sector_t block, int size)
 973 {
 974         struct buffer_head *head = page_buffers(page);
 975         struct buffer_head *bh = head;
 976         int uptodate = PageUptodate(page);
 977
 978         do {
 979                 if (!buffer_mapped(bh)) {
 980                         init_buffer(bh, NULL, NULL);
 981                         bh->b_bdev = bdev;
 982                         bh->b_blocknr = block;
 983                         if (uptodate)
 984                                 set_buffer_uptodate(bh);
 985                         set_buffer_mapped(bh);
 986                 }
 987                 block++;
 988                 bh = bh->b_this_page;
 989         } while (bh != head);
 990 }
 991
 992 /*
 993  * Create the page-cache page that contains the requested block.
 994  *
 995  * This is user purely for blockdev mappings.
 996  */
 997 static struct page *
 998 grow_dev_page(struct block_device *bdev, sector_t block,
 999                 pgoff_t index, int size)
1000 {
1001         struct inode *inode = bdev->bd_inode;
1002         struct page *page;
1003         struct buffer_head *bh;
1004
1005         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1006         if (!page)
1007                 return NULL;
1008
1009         BUG_ON(!PageLocked(page));
1010
1011         if (page_has_buffers(page)) {
1012                 bh = page_buffers(page);
1013                 if (bh->b_size == size) {
1014                         init_page_buffers(page, bdev, block, size);
1015                         return page;
1016                 }
1017                 if (!try_to_free_buffers(page))
1018                         goto failed;
1019         }
1020
1021         /*
1022          * Allocate some buffers for this page
1023          */
1024         bh = alloc_page_buffers(page, size, 0);
1025         if (!bh)
1026                 goto failed;
1027
1028         /*
1029          * Link the page to the buffers and initialise them.  Take the
1030          * lock to be atomic wrt __find_get_block(), which does not
1031          * run under the page lock.
1032          */
1033         spin_lock(&inode->i_mapping->private_lock);
1034         link_dev_buffers(page, bh);
1035         init_page_buffers(page, bdev, block, size);
1036         spin_unlock(&inode->i_mapping->private_lock);
1037         return page;
1038
1039 failed:
1040         BUG();
1041         unlock_page(page);
1042         page_cache_release(page);
1043         return NULL;
1044 }
1045
1046 /*
1047  * Create buffers for the specified block device block's page.  If
1048  * that page was dirty, the buffers are set dirty also.
1049  *
1050  * Except that's a bug.  Attaching dirty buffers to a dirty
1051  * blockdev's page can result in filesystem corruption, because
1052  * some of those buffers may be aliases of filesystem data.
1053  * grow_dev_page() will go BUG() if this happens.
1054  */
1055 static int
1056 grow_buffers(struct block_device *bdev, sector_t block, int size)
1057 {
1058         struct page *page;
1059         pgoff_t index;
1060         int sizebits;
1061
1062         sizebits = -1;
1063         do {
1064                 sizebits++;
1065         } while ((size << sizebits) < PAGE_SIZE);
1066
1067         index = block >> sizebits;
1068
1069         /*
1070          * Check for a block which wants to lie outside our maximum possible
1071          * pagecache index.  (this comparison is done using sector_t types).
1072          */
1073         if (unlikely(index != block >> sizebits)) {
1074                 char b[BDEVNAME_SIZE];
1075
1076                 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1077                         "device %s\n",
1078                         __FUNCTION__, (unsigned long long)block,
1079                         bdevname(bdev, b));
1080                 return -EIO;
1081         }
1082         block = index << sizebits;
1083         /* Create a page with the proper size buffers.. */
1084         page = grow_dev_page(bdev, block, index, size);
1085         if (!page)
1086                 return 0;
1087         unlock_page(page);
1088         page_cache_release(page);
1089         return 1;
1090 }
1091
1092 static struct buffer_head *
1093 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1094 {
1095         /* Size must be multiple of hard sectorsize */
1096         if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1097                         (size < 512 || size > PAGE_SIZE))) {
1098                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1099                                         size);
1100                 printk(KERN_ERR "hardsect size: %d\n",
1101                                         bdev_hardsect_size(bdev));
1102
1103                 dump_stack();
1104                 return NULL;
1105         }
1106
1107         for (;;) {
1108                 struct buffer_head * bh;
1109                 int ret;
1110
1111                 bh = __find_get_block(bdev, block, size);
1112                 if (bh)
1113                         return bh;
1114
1115                 ret = grow_buffers(bdev, block, size);
1116                 if (ret < 0)
1117                         return NULL;
1118                 if (ret == 0)
1119                         free_more_memory();
1120         }
1121 }
1122
1123 /*
1124  * The relationship between dirty buffers and dirty pages:
1125  *
1126  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1127  * the page is tagged dirty in its radix tree.
1128  *
1129  * At all times, the dirtiness of the buffers represents the dirtiness of
1130  * subsections of the page.  If the page has buffers, the page dirty bit is
1131  * merely a hint about the true dirty state.
1132  *
1133  * When a page is set dirty in its entirety, all its buffers are marked dirty
1134  * (if the page has buffers).
1135  *
1136  * When a buffer is marked dirty, its page is dirtied, but the page's other
1137  * buffers are not.
1138  *
1139  * Also.  When blockdev buffers are explicitly read with bread(), they
1140  * individually become uptodate.  But their backing page remains not
1141  * uptodate - even if all of its buffers are uptodate.  A subsequent
1142  * block_read_full_page() against that page will discover all the uptodate
1143  * buffers, will set the page uptodate and will perform no I/O.
1144  */
1145
1146 /**
1147  * mark_buffer_dirty - mark a buffer_head as needing writeout
1148  * @bh: the buffer_head to mark dirty
1149  *
1150  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1151  * backing page dirty, then tag the page as dirty in its address_space's radix
1152  * tree and then attach the address_space's inode to its superblock's dirty
1153  * inode list.
1154  *
1155  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1156  * mapping->tree_lock and the global inode_lock.
1157  */
1158 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1159 {
1160         if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1161                 __set_page_dirty_nobuffers(bh->b_page);
1162 }
1163
1164 /*
1165  * Decrement a buffer_head's reference count.  If all buffers against a page
1166  * have zero reference count, are clean and unlocked, and if the page is clean
1167  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1168  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1169  * a page but it ends up not being freed, and buffers may later be reattached).
1170  */
1171 void __brelse(struct buffer_head * buf)
1172 {
1173         if (atomic_read(&buf->b_count)) {
1174                 put_bh(buf);
1175                 return;
1176         }
1177         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1178         WARN_ON(1);
1179 }
1180
1181 /*
1182  * bforget() is like brelse(), except it discards any
1183  * potentially dirty data.
1184  */
1185 void __bforget(struct buffer_head *bh)
1186 {
1187         clear_buffer_dirty(bh);
1188         if (!list_empty(&bh->b_assoc_buffers)) {
1189 #if 0   // mask by Victor Yu. 02-12-2007
1190                 struct address_space *buffer_mapping = bh->b_page->mapping;
1191 #else
1192                 struct address_space *buffer_mapping = bh->b_page->u.xx.mapping;
1193 #endif
1194
1195                 spin_lock(&buffer_mapping->private_lock);
1196                 list_del_init(&bh->b_assoc_buffers);
1197                 bh->b_assoc_map = NULL;
1198                 spin_unlock(&buffer_mapping->private_lock);
1199         }
1200         __brelse(bh);
1201 }
1202
1203 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1204 {
1205         lock_buffer(bh);
1206         if (buffer_uptodate(bh)) {
1207                 unlock_buffer(bh);
1208                 return bh;
1209         } else {
1210                 get_bh(bh);
1211                 bh->b_end_io = end_buffer_read_sync;
1212                 submit_bh(READ, bh);
1213                 wait_on_buffer(bh);
1214                 if (buffer_uptodate(bh))
1215                         return bh;
1216         }
1217         brelse(bh);
1218         return NULL;
1219 }
1220
1221 /*
1222  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1223  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1224  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1225  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1226  * CPU's LRUs at the same time.
1227  *
1228  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1229  * sb_find_get_block().
1230  *
1231  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1232  * a local interrupt disable for that.
1233  */
1234
1235 #define BH_LRU_SIZE     8
1236
1237 struct bh_lru {
1238         struct buffer_head *bhs[BH_LRU_SIZE];
1239 };
1240
1241 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1242
1243 #ifdef CONFIG_SMP
1244 #define bh_lru_lock()   local_irq_disable()
1245 #define bh_lru_unlock() local_irq_enable()
1246 #else
1247 #define bh_lru_lock()   preempt_disable()
1248 #define bh_lru_unlock() preempt_enable()
1249 #endif
1250
/*
 * Sanity check used before touching the per-cpu LRU: BUG if interrupts
 * are disabled.  Compiles to nothing when irqs_disabled is not a macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
        BUG_ON(irqs_disabled());
#endif
}
1257
1258 /*
1259  * The LRU management algorithm is dopey-but-simple.  Sorry.
1260  */
1261 static void bh_lru_install(struct buffer_head *bh)
1262 {
1263         struct buffer_head *evictee = NULL;
1264         struct bh_lru *lru;
1265
1266         check_irqs_on();
1267         bh_lru_lock();
1268         lru = &__get_cpu_var(bh_lrus);
1269         if (lru->bhs[0] != bh) {
1270                 struct buffer_head *bhs[BH_LRU_SIZE];
1271                 int in;
1272                 int out = 0;
1273
1274                 get_bh(bh);
1275                 bhs[out++] = bh;
1276                 for (in = 0; in < BH_LRU_SIZE; in++) {
1277                         struct buffer_head *bh2 = lru->bhs[in];
1278
1279                         if (bh2 == bh) {
1280                                 __brelse(bh2);
1281                         } else {
1282                                 if (out >= BH_LRU_SIZE) {
1283                                         BUG_ON(evictee != NULL);
1284                                         evictee = bh2;
1285                                 } else {
1286                                         bhs[out++] = bh2;
1287                                 }
1288                         }
1289                 }
1290                 while (out < BH_LRU_SIZE)
1291                         bhs[out++] = NULL;
1292                 memcpy(lru->bhs, bhs, sizeof(bhs));
1293         }
1294         bh_lru_unlock();
1295
1296         if (evictee)
1297                 __brelse(evictee);
1298 }
1299
1300 /*
1301  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1302  */
1303 static struct buffer_head *
1304 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1305 {
1306         struct buffer_head *ret = NULL;
1307         struct bh_lru *lru;
1308         int i;
1309
1310         check_irqs_on();
1311         bh_lru_lock();
1312         lru = &__get_cpu_var(bh_lrus);
1313         for (i = 0; i < BH_LRU_SIZE; i++) {
1314                 struct buffer_head *bh = lru->bhs[i];
1315
1316                 if (bh && bh->b_bdev == bdev &&
1317                                 bh->b_blocknr == block && bh->b_size == size) {
1318                         if (i) {
1319                                 while (i) {
1320                                         lru->bhs[i] = lru->bhs[i - 1];
1321                                         i--;
1322                                 }
1323                                 lru->bhs[0] = bh;
1324                         }
1325                         get_bh(bh);
1326                         ret = bh;
1327                         break;
1328                 }
1329         }
1330         bh_lru_unlock();
1331         return ret;
1332 }
1333
1334 /*
1335  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1336  * it in the LRU and mark it as accessed.  If it is not present then return
1337  * NULL
1338  */
1339 struct buffer_head *
1340 __find_get_block(struct block_device *bdev, sector_t block, int size)
1341 {
1342         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1343
1344         if (bh == NULL) {
1345                 bh = __find_get_block_slow(bdev, block);
1346                 if (bh)
1347                         bh_lru_install(bh);
1348         }
1349         if (bh)
1350                 touch_buffer(bh);
1351         return bh;
1352 }
1353 EXPORT_SYMBOL(__find_get_block);
1354
1355 /*
1356  * __getblk will locate (and, if necessary, create) the buffer_head
1357  * which corresponds to the passed block_device, block and size. The
1358  * returned buffer has its reference count incremented.
1359  *
1360  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1361  * illegal block number, __getblk() will happily return a buffer_head
1362  * which represents the non-existent block.  Very weird.
1363  *
1364  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1365  * attempt is failing.  FIXME, perhaps?
1366  */
1367 struct buffer_head *
1368 __getblk(struct block_device *bdev, sector_t block, int size)
1369 {
1370         struct buffer_head *bh = __find_get_block(bdev, block, size);
1371
1372         might_sleep();
1373         if (bh == NULL)
1374                 bh = __getblk_slow(bdev, block, size);
1375         return bh;
1376 }
1377 EXPORT_SYMBOL(__getblk);
1378
1379 /*
1380  * Do async read-ahead on a buffer..
1381  */
1382 void __breadahead(struct block_device *bdev, sector_t block, int size)
1383 {
1384         struct buffer_head *bh = __getblk(bdev, block, size);
1385         if (likely(bh)) {
1386                 ll_rw_block(READA, 1, &bh);
1387                 brelse(bh);
1388         }
1389 }
1390 EXPORT_SYMBOL(__breadahead);
1391
1392 /**
1393  *  __bread() - reads a specified block and returns the bh
1394  *  @bdev: the block_device to read from
1395  *  @block: number of block
1396  *  @size: size (in bytes) to read
1397  *
1398  *  Reads a specified block, and returns buffer head that contains it.
1399  *  It returns NULL if the block was unreadable.
1400  */
1401 struct buffer_head *
1402 __bread(struct block_device *bdev, sector_t block, int size)
1403 {
1404         struct buffer_head *bh = __getblk(bdev, block, size);
1405
1406         if (likely(bh) && !buffer_uptodate(bh))
1407                 bh = __bread_slow(bh);
1408         return bh;
1409 }
1410 EXPORT_SYMBOL(__bread);
1411
1412 /*
1413  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1414  * This doesn't race because it runs in each cpu either in irq
1415  * or with preempt disabled.
1416  */
1417 static void invalidate_bh_lru(void *arg)
1418 {
1419         struct bh_lru *b = &get_cpu_var(bh_lrus);
1420         int i;
1421
1422         for (i = 0; i < BH_LRU_SIZE; i++) {
1423                 brelse(b->bhs[i]);
1424                 b->bhs[i] = NULL;
1425         }
1426         put_cpu_var(bh_lrus);
1427 }
1428
1429 static void invalidate_bh_lrus(void)
1430 {
1431         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1432 }
1433
1434 void set_bh_page(struct buffer_head *bh,
1435                 struct page *page, unsigned long offset)
1436 {
1437         bh->b_page = page;
1438         BUG_ON(offset >= PAGE_SIZE);
1439         if (PageHighMem(page))
1440                 /*
1441                  * This catches illegal uses and preserves the offset:
1442                  */
1443                 bh->b_data = (char *)(0 + offset);
1444         else
1445                 bh->b_data = page_address(page) + offset;
1446 }
1447 EXPORT_SYMBOL(set_bh_page);
1448
1449 /*
1450  * Called when truncating a buffer on a page completely.
1451  */
1452 static void discard_buffer(struct buffer_head * bh)
1453 {
1454         lock_buffer(bh);
1455         clear_buffer_dirty(bh);
1456         bh->b_bdev = NULL;
1457         clear_buffer_mapped(bh);
1458         clear_buffer_req(bh);
1459         clear_buffer_new(bh);
1460         clear_buffer_delay(bh);
1461         unlock_buffer(bh);
1462 }
1463
1464 /**
1465  * block_invalidatepage - invalidate part of all of a buffer-backed page
1466  *
1467  * @page: the page which is affected
1468  * @offset: the index of the truncation point
1469  *
1470  * block_invalidatepage() is called when all or part of the page has become
1471  * invalidatedby a truncate operation.
1472  *
1473  * block_invalidatepage() does not have to release all buffers, but it must
1474  * ensure that no dirty buffer is left outside @offset and that no I/O
1475  * is underway against any of the blocks which are outside the truncation
1476  * point.  Because the caller is about to free (and possibly reuse) those
1477  * blocks on-disk.
1478  */
1479 void block_invalidatepage(struct page *page, unsigned long offset)
1480 {
1481         struct buffer_head *head, *bh, *next;
1482         unsigned int curr_off = 0;
1483
1484         BUG_ON(!PageLocked(page));
1485         if (!page_has_buffers(page))
1486                 goto out;
1487
1488         head = page_buffers(page);
1489         bh = head;
1490         do {
1491                 unsigned int next_off = curr_off + bh->b_size;
1492                 next = bh->b_this_page;
1493
1494                 /*
1495                  * is this block fully invalidated?
1496                  */
1497                 if (offset <= curr_off)
1498                         discard_buffer(bh);
1499                 curr_off = next_off;
1500                 bh = next;
1501         } while (bh != head);
1502
1503         /*
1504          * We release buffers only if the entire page is being invalidated.
1505          * The get_block cached value has been unconditionally invalidated,
1506          * so real IO is not possible anymore.
1507          */
1508         if (offset == 0)
1509                 try_to_release_page(page, 0);
1510 out:
1511         return;
1512 }
1513 EXPORT_SYMBOL(block_invalidatepage);
1514
1515 /*
1516  * We attach and possibly dirty the buffers atomically wrt
1517  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1518  * is already excluded via the page lock.
1519  */
1520 void create_empty_buffers(struct page *page,
1521                         unsigned long blocksize, unsigned long b_state)
1522 {
1523         struct buffer_head *bh, *head, *tail;
1524
1525         head = alloc_page_buffers(page, blocksize, 1);
1526         bh = head;
1527         do {
1528                 bh->b_state |= b_state;
1529                 tail = bh;
1530                 bh = bh->b_this_page;
1531         } while (bh);
1532         tail->b_this_page = head;
1533
1534 #if 0   // mask by Victor Yu. 02-12-2007
1535         spin_lock(&page->mapping->private_lock);
1536 #else
1537         spin_lock(&page->u.xx.mapping->private_lock);
1538 #endif
1539         if (PageUptodate(page) || PageDirty(page)) {
1540                 bh = head;
1541                 do {
1542                         if (PageDirty(page))
1543                                 set_buffer_dirty(bh);
1544                         if (PageUptodate(page))
1545                                 set_buffer_uptodate(bh);
1546                         bh = bh->b_this_page;
1547                 } while (bh != head);
1548         }
1549         attach_page_buffers(page, head);
1550 #if 0   // mask by Victor Yu. 02-12-2007
1551         spin_unlock(&page->mapping->private_lock);
1552 #else
1553         spin_unlock(&page->u.xx.mapping->private_lock);
1554 #endif
1555 }
1556 EXPORT_SYMBOL(create_empty_buffers);
1557
1558 /*
1559  * We are taking a block for data and we don't want any output from any
1560  * buffer-cache aliases starting from return from that function and
1561  * until the moment when something will explicitly mark the buffer
1562  * dirty (hopefully that will not happen until we will free that block ;-)
1563  * We don't even need to mark it not-uptodate - nobody can expect
1564  * anything from a newly allocated buffer anyway. We used to used
1565  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1566  * don't want to mark the alias unmapped, for example - it would confuse
1567  * anyone who might pick it with bread() afterwards...
1568  *
1569  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1570  * be writeout I/O going on against recently-freed buffers.  We don't
1571  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1572  * only if we really need to.  That happens here.
1573  */
1574 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1575 {
1576         struct buffer_head *old_bh;
1577
1578         might_sleep();
1579
1580         old_bh = __find_get_block_slow(bdev, block);
1581         if (old_bh) {
1582                 clear_buffer_dirty(old_bh);
1583                 wait_on_buffer(old_bh);
1584                 clear_buffer_req(old_bh);
1585                 __brelse(old_bh);
1586         }
1587 }
1588 EXPORT_SYMBOL(unmap_underlying_metadata);
1589
1590 /*
1591  * NOTE! All mapped/uptodate combinations are valid:
1592  *
1593  *      Mapped  Uptodate        Meaning
1594  *
1595  *      No      No              "unknown" - must do get_block()
1596  *      No      Yes             "hole" - zero-filled
1597  *      Yes     No              "allocated" - allocated on disk, not read in
1598  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1599  *
1600  * "Dirty" is valid only with the last case (mapped+uptodate).
1601  */
1602
1603 /*
1604  * While block_write_full_page is writing back the dirty buffers under
1605  * the page lock, whoever dirtied the buffers may decide to clean them
1606  * again at any time.  We handle that by only looking at the buffer
1607  * state inside lock_buffer().
1608  *
1609  * If block_write_full_page() is called for regular writeback
1610  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1611  * locked buffer.   This only can happen if someone has written the buffer
1612  * directly, with submit_bh().  At the address_space level PageWriteback
1613  * prevents this contention from occurring.
1614  */
1615 static int __block_write_full_page(struct inode *inode, struct page *page,
1616                         get_block_t *get_block, struct writeback_control *wbc)
1617 {
1618         int err;
1619         sector_t block;
1620         sector_t last_block;
1621         struct buffer_head *bh, *head;
1622         const unsigned blocksize = 1 << inode->i_blkbits;
1623         int nr_underway = 0;
1624
1625         BUG_ON(!PageLocked(page));
1626
1627         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1628
1629         if (!page_has_buffers(page)) {
1630                 create_empty_buffers(page, blocksize,
1631                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1632         }
1633
1634         /*
1635          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1636          * here, and the (potentially unmapped) buffers may become dirty at
1637          * any time.  If a buffer becomes dirty here after we've inspected it
1638          * then we just miss that fact, and the page stays dirty.
1639          *
1640          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1641          * handle that here by just cleaning them.
1642          */
1643
1644         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1645         head = page_buffers(page);
1646         bh = head;
1647
1648         /*
1649          * Get all the dirty buffers mapped to disk addresses and
1650          * handle any aliases from the underlying blockdev's mapping.
1651          */
1652         do {
1653                 if (block > last_block) {
1654                         /*
1655                          * mapped buffers outside i_size will occur, because
1656                          * this page can be outside i_size when there is a
1657                          * truncate in progress.
1658                          */
1659                         /*
1660                          * The buffer was zeroed by block_write_full_page()
1661                          */
1662                         clear_buffer_dirty(bh);
1663                         set_buffer_uptodate(bh);
1664                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1665                         WARN_ON(bh->b_size != blocksize);
1666                         err = get_block(inode, block, bh, 1);
1667                         if (err)
1668                                 goto recover;
1669                         if (buffer_new(bh)) {
1670                                 /* blockdev mappings never come here */
1671                                 clear_buffer_new(bh);
1672                                 unmap_underlying_metadata(bh->b_bdev,
1673                                                         bh->b_blocknr);
1674                         }
1675                 }
1676                 bh = bh->b_this_page;
1677                 block++;
1678         } while (bh != head);
1679
1680         do {
1681                 if (!buffer_mapped(bh))
1682                         continue;
1683                 /*
1684                  * If it's a fully non-blocking write attempt and we cannot
1685                  * lock the buffer then redirty the page.  Note that this can
1686                  * potentially cause a busy-wait loop from pdflush and kswapd
1687                  * activity, but those code paths have their own higher-level
1688                  * throttling.
1689                  */
1690                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1691                         lock_buffer(bh);
1692                 } else if (test_set_buffer_locked(bh)) {
1693                         redirty_page_for_writepage(wbc, page);
1694                         continue;
1695                 }
1696                 if (test_clear_buffer_dirty(bh)) {
1697                         mark_buffer_async_write(bh);
1698                 } else {
1699                         unlock_buffer(bh);
1700                 }
1701         } while ((bh = bh->b_this_page) != head);
1702
1703         /*
1704          * The page and its buffers are protected by PageWriteback(), so we can
1705          * drop the bh refcounts early.
1706          */
1707         BUG_ON(PageWriteback(page));
1708         set_page_writeback(page);
1709
1710         do {
1711                 struct buffer_head *next = bh->b_this_page;
1712                 if (buffer_async_write(bh)) {
1713                         submit_bh(WRITE, bh);
1714                         nr_underway++;
1715                 }
1716                 bh = next;
1717         } while (bh != head);
1718         unlock_page(page);
1719
1720         err = 0;
1721 done:
1722         if (nr_underway == 0) {
1723                 /*
1724                  * The page was marked dirty, but the buffers were
1725                  * clean.  Someone wrote them back by hand with
1726                  * ll_rw_block/submit_bh.  A rare case.
1727                  */
1728                 int uptodate = 1;
1729                 do {
1730                         if (!buffer_uptodate(bh)) {
1731                                 uptodate = 0;
1732                                 break;
1733                         }
1734                         bh = bh->b_this_page;
1735                 } while (bh != head);
1736                 if (uptodate)
1737                         SetPageUptodate(page);
1738                 end_page_writeback(page);
1739                 /*
1740                  * The page and buffer_heads can be released at any time from
1741                  * here on.
1742                  */
1743                 wbc->pages_skipped++;   /* We didn't write this page */
1744         }
1745         return err;
1746
1747 recover:
1748         /*
1749          * ENOSPC, or some other error.  We may already have added some
1750          * blocks to the file, so we need to write these out to avoid
1751          * exposing stale data.
1752          * The page is currently locked and not marked for writeback
1753          */
1754         bh = head;
1755         /* Recovery: lock and submit the mapped buffers */
1756         do {
1757                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1758                         lock_buffer(bh);
1759                         mark_buffer_async_write(bh);
1760                 } else {
1761                         /*
1762                          * The buffer may have been set dirty during
1763                          * attachment to a dirty page.
1764                          */
1765                         clear_buffer_dirty(bh);
1766                 }
1767         } while ((bh = bh->b_this_page) != head);
1768         SetPageError(page);
1769         BUG_ON(PageWriteback(page));
1770         set_page_writeback(page);
1771         unlock_page(page);
1772         do {
1773                 struct buffer_head *next = bh->b_this_page;
1774                 if (buffer_async_write(bh)) {
1775                         clear_buffer_dirty(bh);
1776                         submit_bh(WRITE, bh);
1777                         nr_underway++;
1778                 }
1779                 bh = next;
1780         } while (bh != head);
1781         goto done;
1782 }
1783
1784 static int __block_prepare_write(struct inode *inode, struct page *page,
1785                 unsigned from, unsigned to, get_block_t *get_block)
1786 {
1787         unsigned block_start, block_end;
1788         sector_t block;
1789         int err = 0;
1790         unsigned blocksize, bbits;
1791         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1792
1793         BUG_ON(!PageLocked(page));
1794         BUG_ON(from > PAGE_CACHE_SIZE);
1795         BUG_ON(to > PAGE_CACHE_SIZE);
1796         BUG_ON(from > to);
1797
1798         blocksize = 1 << inode->i_blkbits;
1799         if (!page_has_buffers(page))
1800                 create_empty_buffers(page, blocksize, 0);
1801         head = page_buffers(page);
1802
1803         bbits = inode->i_blkbits;
1804         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1805
1806         for(bh = head, block_start = 0; bh != head || !block_start;
1807             block++, block_start=block_end, bh = bh->b_this_page) {
1808                 block_end = block_start + blocksize;
1809                 if (block_end <= from || block_start >= to) {
1810                         if (PageUptodate(page)) {
1811                                 if (!buffer_uptodate(bh))
1812                                         set_buffer_uptodate(bh);
1813                         }
1814                         continue;
1815                 }
1816                 if (buffer_new(bh))
1817                         clear_buffer_new(bh);
1818                 if (!buffer_mapped(bh)) {
1819                         WARN_ON(bh->b_size != blocksize);
1820                         err = get_block(inode, block, bh, 1);
1821                         if (err)
1822                                 break;
1823                         if (buffer_new(bh)) {
1824                                 unmap_underlying_metadata(bh->b_bdev,
1825                                                         bh->b_blocknr);
1826                                 if (PageUptodate(page)) {
1827                                         set_buffer_uptodate(bh);
1828                                         continue;
1829                                 }
1830                                 if (block_end > to || block_start < from) {
1831                                         void *kaddr;
1832
1833                                         kaddr = kmap_atomic(page, KM_USER0);
1834                                         if (block_end > to)
1835                                                 memset(kaddr+to, 0,
1836                                                         block_end-to);
1837                                         if (block_start < from)
1838                                                 memset(kaddr+block_start,
1839                                                         0, from-block_start);
1840                                         flush_dcache_page(page);
1841                                         kunmap_atomic(kaddr, KM_USER0);
1842                                 }
1843                                 continue;
1844                         }
1845                 }
1846                 if (PageUptodate(page)) {
1847                         if (!buffer_uptodate(bh))
1848                                 set_buffer_uptodate(bh);
1849                         continue;
1850                 }
1851                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1852                      (block_start < from || block_end > to)) {
1853                         ll_rw_block(READ, 1, &bh);
1854                         *wait_bh++=bh;
1855                 }
1856         }
1857         /*
1858          * If we issued read requests - let them complete.
1859          */
1860         while(wait_bh > wait) {
1861                 wait_on_buffer(*--wait_bh);
1862                 if (!buffer_uptodate(*wait_bh))
1863                         err = -EIO;
1864         }
1865         if (!err) {
1866                 bh = head;
1867                 do {
1868                         if (buffer_new(bh))
1869                                 clear_buffer_new(bh);
1870                 } while ((bh = bh->b_this_page) != head);
1871                 return 0;
1872         }
1873         /* Error case: */
1874         /*
1875          * Zero out any newly allocated blocks to avoid exposing stale
1876          * data.  If BH_New is set, we know that the block was newly
1877          * allocated in the above loop.
1878          */
1879         bh = head;
1880         block_start = 0;
1881         do {
1882                 block_end = block_start+blocksize;
1883                 if (block_end <= from)
1884                         goto next_bh;
1885                 if (block_start >= to)
1886                         break;
1887                 if (buffer_new(bh)) {
1888                         void *kaddr;
1889
1890                         clear_buffer_new(bh);
1891                         kaddr = kmap_atomic(page, KM_USER0);
1892                         memset(kaddr+block_start, 0, bh->b_size);
1893                         flush_dcache_page(page);
1894                         kunmap_atomic(kaddr, KM_USER0);
1895                         set_buffer_uptodate(bh);
1896                         mark_buffer_dirty(bh);
1897                 }
1898 next_bh:
1899                 block_start = block_end;
1900                 bh = bh->b_this_page;
1901         } while (bh != head);
1902         return err;
1903 }
1904
1905 static int __block_commit_write(struct inode *inode, struct page *page,
1906                 unsigned from, unsigned to)
1907 {
1908         unsigned block_start, block_end;
1909         int partial = 0;
1910         unsigned blocksize;
1911         struct buffer_head *bh, *head;
1912
1913         blocksize = 1 << inode->i_blkbits;
1914
1915         for(bh = head = page_buffers(page), block_start = 0;
1916             bh != head || !block_start;
1917             block_start=block_end, bh = bh->b_this_page) {
1918                 block_end = block_start + blocksize;
1919                 if (block_end <= from || block_start >= to) {
1920                         if (!buffer_uptodate(bh))
1921                                 partial = 1;
1922                 } else {
1923                         set_buffer_uptodate(bh);
1924                         mark_buffer_dirty(bh);
1925                 }
1926         }
1927
1928         /*
1929          * If this is a partial write which happened to make all buffers
1930          * uptodate then we can optimize away a bogus readpage() for
1931          * the next read(). Here we 'discover' whether the page went
1932          * uptodate as a result of this (potentially partial) write.
1933          */
1934         if (!partial)
1935                 SetPageUptodate(page);
1936         return 0;
1937 }
1938
1939 /*
1940  * Generic "read page" function for block devices that have the normal
1941  * get_block functionality. This is most of the block device filesystems.
1942  * Reads the page asynchronously --- the unlock_buffer() and
1943  * set/clear_buffer_uptodate() functions propagate buffer state into the
1944  * page struct once IO has completed.
1945  */
1946 int block_read_full_page(struct page *page, get_block_t *get_block)
1947 {
1948 #if 0   // mask by Victor Yu. 02-12-2007
1949         struct inode *inode = page->mapping->host;
1950 #else
1951         struct inode *inode = page->u.xx.mapping->host;
1952 #endif
1953         sector_t iblock, lblock;
1954         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1955         unsigned int blocksize;
1956         int nr, i;
1957         int fully_mapped = 1;
1958
1959         BUG_ON(!PageLocked(page));
1960         blocksize = 1 << inode->i_blkbits;
1961         if (!page_has_buffers(page))
1962                 create_empty_buffers(page, blocksize, 0);
1963         head = page_buffers(page);
1964
1965         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1966         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
1967         bh = head;
1968         nr = 0;
1969         i = 0;
1970
1971         do {
1972                 if (buffer_uptodate(bh))
1973                         continue;
1974
1975                 if (!buffer_mapped(bh)) {
1976                         int err = 0;
1977
1978                         fully_mapped = 0;
1979                         if (iblock < lblock) {
1980                                 WARN_ON(bh->b_size != blocksize);
1981                                 err = get_block(inode, iblock, bh, 0);
1982                                 if (err)
1983                                         SetPageError(page);
1984                         }
1985                         if (!buffer_mapped(bh)) {
1986                                 void *kaddr = kmap_atomic(page, KM_USER0);
1987                                 memset(kaddr + i * blocksize, 0, blocksize);
1988                                 flush_dcache_page(page);
1989                                 kunmap_atomic(kaddr, KM_USER0);
1990                                 if (!err)
1991                                         set_buffer_uptodate(bh);
1992                                 continue;
1993                         }
1994                         /*
1995                          * get_block() might have updated the buffer
1996                          * synchronously
1997                          */
1998                         if (buffer_uptodate(bh))
1999                                 continue;
2000                 }
2001                 arr[nr++] = bh;
2002         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2003
2004         if (fully_mapped)
2005                 SetPageMappedToDisk(page);
2006
2007         if (!nr) {
2008                 /*
2009                  * All buffers are uptodate - we can set the page uptodate
2010                  * as well. But not if get_block() returned an error.
2011                  */
2012                 if (!PageError(page))
2013                         SetPageUptodate(page);
2014                 unlock_page(page);
2015                 return 0;
2016         }
2017
2018         /* Stage two: lock the buffers */
2019         for (i = 0; i < nr; i++) {
2020                 bh = arr[i];
2021                 lock_buffer(bh);
2022                 mark_buffer_async_read(bh);
2023         }
2024
2025         /*
2026          * Stage 3: start the IO.  Check for uptodateness
2027          * inside the buffer lock in case another process reading
2028          * the underlying blockdev brought it uptodate (the sct fix).
2029          */
2030         for (i = 0; i < nr; i++) {
2031                 bh = arr[i];
2032                 if (buffer_uptodate(bh))
2033                         end_buffer_async_read(bh, 1);
2034                 else
2035                         submit_bh(READ, bh);
2036         }
2037         return 0;
2038 }
2039
2040 /* utility function for filesystems that need to do work on expanding
2041  * truncates.  Uses prepare/commit_write to allow the filesystem to
2042  * deal with the hole.
2043  */
2044 static int __generic_cont_expand(struct inode *inode, loff_t size,
2045                                  pgoff_t index, unsigned int offset)
2046 {
2047         struct address_space *mapping = inode->i_mapping;
2048         struct page *page;
2049         unsigned long limit;
2050         int err;
2051
2052         err = -EFBIG;
2053         limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2054         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2055                 send_sig(SIGXFSZ, current, 0);
2056                 goto out;
2057         }
2058         if (size > inode->i_sb->s_maxbytes)
2059                 goto out;
2060
2061         err = -ENOMEM;
2062         page = grab_cache_page(mapping, index);
2063         if (!page)
2064                 goto out;
2065         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2066         if (err) {
2067                 /*
2068                  * ->prepare_write() may have instantiated a few blocks
2069                  * outside i_size.  Trim these off again.
2070                  */
2071                 unlock_page(page);
2072                 page_cache_release(page);
2073                 vmtruncate(inode, inode->i_size);
2074                 goto out;
2075         }
2076
2077         err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2078
2079         unlock_page(page);
2080         page_cache_release(page);
2081         if (err > 0)
2082                 err = 0;
2083 out:
2084         return err;
2085 }
2086
2087 int generic_cont_expand(struct inode *inode, loff_t size)
2088 {
2089         pgoff_t index;
2090         unsigned int offset;
2091
2092         offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2093
2094         /* ugh.  in prepare/commit_write, if from==to==start of block, we
2095         ** skip the prepare.  make sure we never send an offset for the start
2096         ** of a block
2097         */
2098         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2099                 /* caller must handle this extra byte. */
2100                 offset++;
2101         }
2102         index = size >> PAGE_CACHE_SHIFT;
2103
2104         return __generic_cont_expand(inode, size, index, offset);
2105 }
2106
2107 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2108 {
2109         loff_t pos = size - 1;
2110         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2111         unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2112
2113         /* prepare/commit_write can handle even if from==to==start of block. */
2114         return __generic_cont_expand(inode, size, index, offset);
2115 }
2116
2117 /*
2118  * For moronic filesystems that do not allow holes in file.
2119  * We may have to extend the file.
2120  */
2121
2122 int cont_prepare_write(struct page *page, unsigned offset,
2123                 unsigned to, get_block_t *get_block, loff_t *bytes)
2124 {
2125 #if 0   // mask by Victor Yu. 02-12-2007
2126         struct address_space *mapping = page->mapping;
2127 #else
2128         struct address_space *mapping = page->u.xx.mapping;
2129 #endif
2130         struct inode *inode = mapping->host;
2131         struct page *new_page;
2132         pgoff_t pgpos;
2133         long status;
2134         unsigned zerofrom;
2135         unsigned blocksize = 1 << inode->i_blkbits;
2136         void *kaddr;
2137
2138         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2139                 status = -ENOMEM;
2140                 new_page = grab_cache_page(mapping, pgpos);
2141                 if (!new_page)
2142                         goto out;
2143                 /* we might sleep */
2144                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2145                         unlock_page(new_page);
2146                         page_cache_release(new_page);
2147                         continue;
2148                 }
2149                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2150                 if (zerofrom & (blocksize-1)) {
2151                         *bytes |= (blocksize-1);
2152                         (*bytes)++;
2153                 }
2154                 status = __block_prepare_write(inode, new_page, zerofrom,
2155                                                 PAGE_CACHE_SIZE, get_block);
2156                 if (status)
2157                         goto out_unmap;
2158                 kaddr = kmap_atomic(new_page, KM_USER0);
2159                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2160                 flush_dcache_page(new_page);
2161                 kunmap_atomic(kaddr, KM_USER0);
2162                 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2163                 unlock_page(new_page);
2164                 page_cache_release(new_page);
2165         }
2166
2167         if (page->index < pgpos) {
2168                 /* completely inside the area */
2169                 zerofrom = offset;
2170         } else {
2171                 /* page covers the boundary, find the boundary offset */
2172                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2173
2174                 /* if we will expand the thing last block will be filled */
2175                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2176                         *bytes |= (blocksize-1);
2177                         (*bytes)++;
2178                 }
2179
2180                 /* starting below the boundary? Nothing to zero out */
2181                 if (offset <= zerofrom)
2182                         zerofrom = offset;
2183         }
2184         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2185         if (status)
2186                 goto out1;
2187         if (zerofrom < offset) {
2188                 kaddr = kmap_atomic(page, KM_USER0);
2189                 memset(kaddr+zerofrom, 0, offset-zerofrom);
2190                 flush_dcache_page(page);
2191                 kunmap_atomic(kaddr, KM_USER0);
2192                 __block_commit_write(inode, page, zerofrom, offset);
2193         }
2194         return 0;
2195 out1:
2196         ClearPageUptodate(page);
2197         return status;
2198
2199 out_unmap:
2200         ClearPageUptodate(new_page);
2201         unlock_page(new_page);
2202         page_cache_release(new_page);
2203 out:
2204         return status;
2205 }
2206
2207 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2208                         get_block_t *get_block)
2209 {
2210 #if 0   // mask by Victor Yu. 02-12-2007
2211         struct inode *inode = page->mapping->host;
2212 #else
2213         struct inode *inode = page->u.xx.mapping->host;
2214 #endif
2215         int err = __block_prepare_write(inode, page, from, to, get_block);
2216         if (err)
2217                 ClearPageUptodate(page);
2218         return err;
2219 }
2220
2221 int block_commit_write(struct page *page, unsigned from, unsigned to)
2222 {
2223 #if 0   // mask by Victor Yu. 02-12-2007
2224         struct inode *inode = page->mapping->host;
2225 #else
2226         struct inode *inode = page->u.xx.mapping->host;
2227 #endif
2228         __block_commit_write(inode,page,from,to);
2229         return 0;
2230 }
2231
2232 int generic_commit_write(struct file *file, struct page *page,
2233                 unsigned from, unsigned to)
2234 {
2235 #if 0   // mask by Victor Yu. 02-12-2007
2236         struct inode *inode = page->mapping->host;
2237 #else
2238         struct inode *inode = page->u.xx.mapping->host;
2239 #endif
2240         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2241         __block_commit_write(inode,page,from,to);
2242         /*
2243          * No need to use i_size_read() here, the i_size
2244          * cannot change under us because we hold i_mutex.
2245          */
2246         if (pos > inode->i_size) {
2247                 i_size_write(inode, pos);
2248                 mark_inode_dirty(inode);
2249         }
2250         return 0;
2251 }
2252
2253
/*
 * I/O completion handler for the pre-reads issued by nobh_prepare_write().
 * Those buffer_heads are freed immediately, while still under the page lock,
 * so this handler must not touch the bh after unlocking it.
 *
 * Note: unlock_buffer() does sort-of touch the bh after unlocking it, but the
 * race is benign: it only uses the bh's address for hashing at that point and
 * does not access the bh itself.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}
2274
2275 /*
2276  * On entry, the page is fully not uptodate.
2277  * On exit the page is fully uptodate in the areas outside (from,to)
2278  */
2279 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2280                         get_block_t *get_block)
2281 {
2282 #if 0   // mask by Victor Yu. 02-12-2007
2283         struct inode *inode = page->mapping->host;
2284 #else
2285         struct inode *inode = page->u.xx.mapping->host;
2286 #endif
2287         const unsigned blkbits = inode->i_blkbits;
2288         const unsigned blocksize = 1 << blkbits;
2289         struct buffer_head map_bh;
2290         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2291         unsigned block_in_page;
2292         unsigned block_start;
2293         sector_t block_in_file;
2294         char *kaddr;
2295         int nr_reads = 0;
2296         int i;
2297         int ret = 0;
2298         int is_mapped_to_disk = 1;
2299         int dirtied_it = 0;
2300
2301         if (PageMappedToDisk(page))
2302                 return 0;
2303
2304         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2305         map_bh.b_page = page;
2306
2307         /*
2308          * We loop across all blocks in the page, whether or not they are
2309          * part of the affected region.  This is so we can discover if the
2310          * page is fully mapped-to-disk.
2311          */
2312         for (block_start = 0, block_in_page = 0;
2313                   block_start < PAGE_CACHE_SIZE;
2314                   block_in_page++, block_start += blocksize) {
2315                 unsigned block_end = block_start + blocksize;
2316                 int create;
2317
2318                 map_bh.b_state = 0;
2319                 create = 1;
2320                 if (block_start >= to)
2321                         create = 0;
2322                 map_bh.b_size = blocksize;
2323                 ret = get_block(inode, block_in_file + block_in_page,
2324                                         &map_bh, create);
2325                 if (ret)
2326                         goto failed;
2327                 if (!buffer_mapped(&map_bh))
2328                         is_mapped_to_disk = 0;
2329                 if (buffer_new(&map_bh))
2330                         unmap_underlying_metadata(map_bh.b_bdev,
2331                                                         map_bh.b_blocknr);
2332                 if (PageUptodate(page))
2333                         continue;
2334                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2335                         kaddr = kmap_atomic(page, KM_USER0);
2336                         if (block_start < from) {
2337                                 memset(kaddr+block_start, 0, from-block_start);
2338                                 dirtied_it = 1;
2339                         }
2340                         if (block_end > to) {
2341                                 memset(kaddr + to, 0, block_end - to);
2342                                 dirtied_it = 1;
2343                         }
2344                         flush_dcache_page(page);
2345                         kunmap_atomic(kaddr, KM_USER0);
2346                         continue;
2347                 }
2348                 if (buffer_uptodate(&map_bh))
2349                         continue;       /* reiserfs does this */
2350                 if (block_start < from || block_end > to) {
2351                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2352
2353                         if (!bh) {
2354                                 ret = -ENOMEM;
2355                                 goto failed;
2356                         }
2357                         bh->b_state = map_bh.b_state;
2358                         atomic_set(&bh->b_count, 0);
2359                         bh->b_this_page = NULL;
2360                         bh->b_page = page;
2361                         bh->b_blocknr = map_bh.b_blocknr;
2362                         bh->b_size = blocksize;
2363                         bh->b_data = (char *)(long)block_start;
2364                         bh->b_bdev = map_bh.b_bdev;
2365                         bh->b_private = NULL;
2366                         read_bh[nr_reads++] = bh;
2367                 }
2368         }
2369
2370         if (nr_reads) {
2371                 struct buffer_head *bh;
2372
2373                 /*
2374                  * The page is locked, so these buffers are protected from
2375                  * any VM or truncate activity.  Hence we don't need to care
2376                  * for the buffer_head refcounts.
2377                  */
2378                 for (i = 0; i < nr_reads; i++) {
2379                         bh = read_bh[i];
2380                         lock_buffer(bh);
2381                         bh->b_end_io = end_buffer_read_nobh;
2382                         submit_bh(READ, bh);
2383                 }
2384                 for (i = 0; i < nr_reads; i++) {
2385                         bh = read_bh[i];
2386                         wait_on_buffer(bh);
2387                         if (!buffer_uptodate(bh))
2388                                 ret = -EIO;
2389                         free_buffer_head(bh);
2390                         read_bh[i] = NULL;
2391                 }
2392                 if (ret)
2393                         goto failed;
2394         }
2395
2396         if (is_mapped_to_disk)
2397                 SetPageMappedToDisk(page);
2398         SetPageUptodate(page);
2399
2400         /*
2401          * Setting the page dirty here isn't necessary for the prepare_write
2402          * function - commit_write will do that.  But if/when this function is
2403          * used within the pagefault handler to ensure that all mmapped pages
2404          * have backing space in the filesystem, we will need to dirty the page
2405          * if its contents were altered.
2406          */
2407         if (dirtied_it)
2408                 set_page_dirty(page);
2409
2410         return 0;
2411
2412 failed:
2413         for (i = 0; i < nr_reads; i++) {
2414                 if (read_bh[i])
2415                         free_buffer_head(read_bh[i]);
2416         }
2417
2418         /*
2419          * Error recovery is pretty slack.  Clear the page and mark it dirty
2420          * so we'll later zero out any blocks which _were_ allocated.
2421          */
2422         kaddr = kmap_atomic(page, KM_USER0);
2423         memset(kaddr, 0, PAGE_CACHE_SIZE);
2424         flush_dcache_page(page);
2425         kunmap_atomic(kaddr, KM_USER0);
2426         SetPageUptodate(page);
2427         set_page_dirty(page);
2428         return ret;
2429 }
2430 EXPORT_SYMBOL(nobh_prepare_write);
2431
2432 int nobh_commit_write(struct file *file, struct page *page,
2433                 unsigned from, unsigned to)
2434 {
2435 #if 0   // mask by Victor Yu. 02-12-2007
2436         struct inode *inode = page->mapping->host;
2437 #else
2438         struct inode *inode = page->u.xx.mapping->host;
2439 #endif
2440         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2441
2442         set_page_dirty(page);
2443         if (pos > inode->i_size) {
2444                 i_size_write(inode, pos);
2445                 mark_inode_dirty(inode);
2446         }
2447         return 0;
2448 }
2449 EXPORT_SYMBOL(nobh_commit_write);
2450
2451 /*
2452  * nobh_writepage() - based on block_full_write_page() except
2453  * that it tries to operate without attaching bufferheads to
2454  * the page.
2455  */
2456 int nobh_writepage(struct page *page, get_block_t *get_block,
2457                         struct writeback_control *wbc)
2458 {
2459 #if 0   // mask by Victor Yu. 02-12-2007
2460         struct inode * const inode = page->mapping->host;
2461 #else
2462         struct inode * const inode = page->u.xx.mapping->host;
2463 #endif
2464         loff_t i_size = i_size_read(inode);
2465         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2466         unsigned offset;
2467         void *kaddr;
2468         int ret;
2469
2470         /* Is the page fully inside i_size? */
2471         if (page->index < end_index)
2472                 goto out;
2473
2474         /* Is the page fully outside i_size? (truncate in progress) */
2475         offset = i_size & (PAGE_CACHE_SIZE-1);
2476         if (page->index >= end_index+1 || !offset) {
2477                 /*
2478                  * The page may have dirty, unmapped buffers.  For example,
2479                  * they may have been added in ext3_writepage().  Make them
2480                  * freeable here, so the page does not leak.
2481                  */
2482 #if 0
2483                 /* Not really sure about this  - do we need this ? */
2484                 if (page->mapping->a_ops->invalidatepage)
2485                         page->mapping->a_ops->invalidatepage(page, offset);
2486 #endif
2487                 unlock_page(page);
2488                 return 0; /* don't care */
2489         }
2490
2491         /*
2492          * The page straddles i_size.  It must be zeroed out on each and every
2493          * writepage invocation because it may be mmapped.  "A file is mapped
2494          * in multiples of the page size.  For a file that is not a multiple of
2495          * the  page size, the remaining memory is zeroed when mapped, and
2496          * writes to that region are not written out to the file."
2497          */
2498         kaddr = kmap_atomic(page, KM_USER0);
2499         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2500         flush_dcache_page(page);
2501         kunmap_atomic(kaddr, KM_USER0);
2502 out:
2503         ret = mpage_writepage(page, get_block, wbc);
2504         if (ret == -EAGAIN)
2505                 ret = __block_write_full_page(inode, page, get_block, wbc);
2506         return ret;
2507 }
2508 EXPORT_SYMBOL(nobh_writepage);
2509
2510 /*
2511  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2512  */
2513 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2514 {
2515         struct inode *inode = mapping->host;
2516         unsigned blocksize = 1 << inode->i_blkbits;
2517         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2518         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2519         unsigned to;
2520         struct page *page;
2521         const struct address_space_operations *a_ops = mapping->a_ops;
2522         char *kaddr;
2523         int ret = 0;
2524
2525         if ((offset & (blocksize - 1)) == 0)
2526                 goto out;
2527
2528         ret = -ENOMEM;
2529         page = grab_cache_page(mapping, index);
2530         if (!page)
2531                 goto out;
2532
2533         to = (offset + blocksize) & ~(blocksize - 1);
2534         ret = a_ops->prepare_write(NULL, page, offset, to);
2535         if (ret == 0) {
2536                 kaddr = kmap_atomic(page, KM_USER0);
2537                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2538                 flush_dcache_page(page);
2539                 kunmap_atomic(kaddr, KM_USER0);
2540                 set_page_dirty(page);
2541         }
2542         unlock_page(page);
2543         page_cache_release(page);
2544 out:
2545         return ret;
2546 }
2547 EXPORT_SYMBOL(nobh_truncate_page);
2548
2549 int block_truncate_page(struct address_space *mapping,
2550                         loff_t from, get_block_t *get_block)
2551 {
2552         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2553         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2554         unsigned blocksize;
2555         sector_t iblock;
2556         unsigned length, pos;
2557         struct inode *inode = mapping->host;
2558         struct page *page;
2559         struct buffer_head *bh;
2560         void *kaddr;
2561         int err;
2562
2563         blocksize = 1 << inode->i_blkbits;
2564         length = offset & (blocksize - 1);
2565
2566         /* Block boundary? Nothing to do */
2567         if (!length)
2568                 return 0;
2569
2570         length = blocksize - length;
2571         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2572
2573         page = grab_cache_page(mapping, index);
2574         err = -ENOMEM;
2575         if (!page)
2576                 goto out;
2577
2578         if (!page_has_buffers(page))
2579                 create_empty_buffers(page, blocksize, 0);
2580
2581         /* Find the buffer that contains "offset" */
2582         bh = page_buffers(page);
2583         pos = blocksize;
2584         while (offset >= pos) {
2585                 bh = bh->b_this_page;
2586                 iblock++;
2587                 pos += blocksize;
2588         }
2589
2590         err = 0;
2591         if (!buffer_mapped(bh)) {
2592                 WARN_ON(bh->b_size != blocksize);
2593                 err = get_block(inode, iblock, bh, 0);
2594                 if (err)
2595                         goto unlock;
2596                 /* unmapped? It's a hole - nothing to do */
2597                 if (!buffer_mapped(bh))
2598                         goto unlock;
2599         }
2600
2601         /* Ok, it's mapped. Make sure it's up-to-date */
2602         if (PageUptodate(page))
2603                 set_buffer_uptodate(bh);
2604
2605         if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2606                 err = -EIO;
2607                 ll_rw_block(READ, 1, &bh);
2608                 wait_on_buffer(bh);
2609                 /* Uhhuh. Read error. Complain and punt. */
2610                 if (!buffer_uptodate(bh))
2611                         goto unlock;
2612         }
2613
2614         kaddr = kmap_atomic(page, KM_USER0);
2615         memset(kaddr + offset, 0, length);
2616         flush_dcache_page(page);
2617         kunmap_atomic(kaddr, KM_USER0);
2618
2619         mark_buffer_dirty(bh);
2620         err = 0;
2621
2622 unlock:
2623         unlock_page(page);
2624         page_cache_release(page);
2625 out:
2626         return err;
2627 }
2628
2629 /*
2630  * The generic ->writepage function for buffer-backed address_spaces
2631  */
2632 int block_write_full_page(struct page *page, get_block_t *get_block,
2633                         struct writeback_control *wbc)
2634 {
2635 #if 0   // mask by Victor Yu. 02-12-2007
2636         struct inode * const inode = page->mapping->host;
2637 #else
2638         struct inode * const inode = page->u.xx.mapping->host;
2639 #endif
2640         loff_t i_size = i_size_read(inode);
2641         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2642         unsigned offset;
2643         void *kaddr;
2644
2645         /* Is the page fully inside i_size? */
2646         if (page->index < end_index)
2647                 return __block_write_full_page(inode, page, get_block, wbc);
2648
2649         /* Is the page fully outside i_size? (truncate in progress) */
2650         offset = i_size & (PAGE_CACHE_SIZE-1);
2651         if (page->index >= end_index+1 || !offset) {
2652                 /*
2653                  * The page may have dirty, unmapped buffers.  For example,
2654                  * they may have been added in ext3_writepage().  Make them
2655                  * freeable here, so the page does not leak.
2656                  */
2657                 do_invalidatepage(page, 0);
2658                 unlock_page(page);
2659                 return 0; /* don't care */
2660         }
2661
2662         /*
2663          * The page straddles i_size.  It must be zeroed out on each and every
2664          * writepage invokation because it may be mmapped.  "A file is mapped
2665          * in multiples of the page size.  For a file that is not a multiple of
2666          * the  page size, the remaining memory is zeroed when mapped, and
2667          * writes to that region are not written out to the file."
2668          */
2669         kaddr = kmap_atomic(page, KM_USER0);
2670         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2671         flush_dcache_page(page);
2672         kunmap_atomic(kaddr, KM_USER0);
2673         return __block_write_full_page(inode, page, get_block, wbc);
2674 }
2675
2676 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2677                             get_block_t *get_block)
2678 {
2679         struct buffer_head tmp;
2680         struct inode *inode = mapping->host;
2681         tmp.b_state = 0;
2682         tmp.b_blocknr = 0;
2683         tmp.b_size = 1 << inode->i_blkbits;
2684         get_block(inode, block, &tmp, 0);
2685         return tmp.b_blocknr;
2686 }
2687
2688 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2689 {
2690         struct buffer_head *bh = bio->bi_private;
2691
2692         if (bio->bi_size)
2693                 return 1;
2694
2695         if (err == -EOPNOTSUPP) {
2696                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2697                 set_bit(BH_Eopnotsupp, &bh->b_state);
2698         }
2699
2700         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2701         bio_put(bio);
2702         return 0;
2703 }
2704
2705 int submit_bh(int rw, struct buffer_head * bh)
2706 {
2707         struct bio *bio;
2708         int ret = 0;
2709
2710         BUG_ON(!buffer_locked(bh));
2711         BUG_ON(!buffer_mapped(bh));
2712         BUG_ON(!bh->b_end_io);
2713
2714         if (buffer_ordered(bh) && (rw == WRITE))
2715                 rw = WRITE_BARRIER;
2716
2717         /*
2718          * Only clear out a write error when rewriting, should this
2719          * include WRITE_SYNC as well?
2720          */
2721         if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2722                 clear_buffer_write_io_error(bh);
2723
2724         /*
2725          * from here on down, it's all bio -- do the initial mapping,
2726          * submit_bio -> generic_make_request may further map this bio around
2727          */
2728         bio = bio_alloc(GFP_NOIO, 1);
2729
2730         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2731         bio->bi_bdev = bh->b_bdev;
2732         bio->bi_io_vec[0].bv_page = bh->b_page;
2733         bio->bi_io_vec[0].bv_len = bh->b_size;
2734         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2735
2736         bio->bi_vcnt = 1;
2737         bio->bi_idx = 0;
2738         bio->bi_size = bh->b_size;
2739
2740         bio->bi_end_io = end_bio_bh_io_sync;
2741         bio->bi_private = bh;
2742
2743         bio_get(bio);
2744         submit_bio(rw, bio);
2745
2746         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2747                 ret = -EOPNOTSUPP;
2748
2749         bio_put(bio);
2750         return ret;
2751 }
2752
2753 /**
2754  * ll_rw_block: low-level access to block devices (DEPRECATED)
2755  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2756  * @nr: number of &struct buffer_heads in the array
2757  * @bhs: array of pointers to &struct buffer_head
2758  *
2759  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2760  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2761  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2762  * are sent to disk. The fourth %READA option is described in the documentation
2763  * for generic_make_request() which ll_rw_block() calls.
2764  *
2765  * This function drops any buffer that it cannot get a lock on (with the
2766  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2767  * clean when doing a write request, and any buffer that appears to be
2768  * up-to-date when doing read request.  Further it marks as clean buffers that
2769  * are processed for writing (the buffer cache won't assume that they are
2770  * actually clean until the buffer gets unlocked).
2771  *
2772  * ll_rw_block sets b_end_io to simple completion handler that marks
2773  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2774  * any waiters.
2775  *
2776  * All of the buffers must be for the same device, and must also be a
2777  * multiple of the current approved size for the device.
2778  */
2779 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2780 {
2781         int i;
2782
2783         for (i = 0; i < nr; i++) {
2784                 struct buffer_head *bh = bhs[i];
2785
2786                 if (rw == SWRITE)
2787                         lock_buffer(bh);
2788                 else if (test_set_buffer_locked(bh))
2789                         continue;
2790
2791                 if (rw == WRITE || rw == SWRITE) {
2792                         if (test_clear_buffer_dirty(bh)) {
2793                                 bh->b_end_io = end_buffer_write_sync;
2794                                 get_bh(bh);
2795                                 submit_bh(WRITE, bh);
2796                                 continue;
2797                         }
2798                 } else {
2799                         if (!buffer_uptodate(bh)) {
2800                                 bh->b_end_io = end_buffer_read_sync;
2801                                 get_bh(bh);
2802                                 submit_bh(rw, bh);
2803                                 continue;
2804                         }
2805                 }
2806                 unlock_buffer(bh);
2807         }
2808 }
2809
2810 /*
2811  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2812  * and then start new I/O and then wait upon it.  The caller must have a ref on
2813  * the buffer_head.
2814  */
2815 int sync_dirty_buffer(struct buffer_head *bh)
2816 {
2817         int ret = 0;
2818
2819         WARN_ON(atomic_read(&bh->b_count) < 1);
2820         lock_buffer(bh);
2821         if (test_clear_buffer_dirty(bh)) {
2822                 get_bh(bh);
2823                 bh->b_end_io = end_buffer_write_sync;
2824                 ret = submit_bh(WRITE, bh);
2825                 wait_on_buffer(bh);
2826                 if (buffer_eopnotsupp(bh)) {
2827                         clear_buffer_eopnotsupp(bh);
2828                         ret = -EOPNOTSUPP;
2829                 }
2830                 if (!ret && !buffer_uptodate(bh))
2831                         ret = -EIO;
2832         } else {
2833                 unlock_buffer(bh);
2834         }
2835         return ret;
2836 }
2837
2838 /*
2839  * try_to_free_buffers() checks if all the buffers on this particular page
2840  * are unused, and releases them if so.
2841  *
2842  * Exclusion against try_to_free_buffers may be obtained by either
2843  * locking the page or by holding its mapping's private_lock.
2844  *
2845  * If the page is dirty but all the buffers are clean then we need to
2846  * be sure to mark the page clean as well.  This is because the page
2847  * may be against a block device, and a later reattachment of buffers
2848  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2849  * filesystem data on the same device.
2850  *
2851  * The same applies to regular filesystem pages: if all the buffers are
2852  * clean then we set the page clean and proceed.  To do that, we require
2853  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2854  * private_lock.
2855  *
2856  * try_to_free_buffers() is non-blocking.
2857  */
2858 static inline int buffer_busy(struct buffer_head *bh)
2859 {
2860         return atomic_read(&bh->b_count) |
2861                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2862 }
2863
2864 static int
2865 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2866 {
2867         struct buffer_head *head = page_buffers(page);
2868         struct buffer_head *bh;
2869
2870         bh = head;
2871         do {
2872 #if 0   // mask by Victor Yu. 02-12-2007
2873                 if (buffer_write_io_error(bh) && page->mapping)
2874                         set_bit(AS_EIO, &page->mapping->flags);
2875 #else
2876                 if (buffer_write_io_error(bh) && page->u.xx.mapping)
2877                         set_bit(AS_EIO, &page->u.xx.mapping->flags);
2878 #endif
2879                 if (buffer_busy(bh))
2880                         goto failed;
2881                 bh = bh->b_this_page;
2882         } while (bh != head);
2883
2884         do {
2885                 struct buffer_head *next = bh->b_this_page;
2886
2887                 if (!list_empty(&bh->b_assoc_buffers))
2888                         __remove_assoc_queue(bh);
2889                 bh = next;
2890         } while (bh != head);
2891         *buffers_to_free = head;
2892         __clear_page_buffers(page);
2893         return 1;
2894 failed:
2895         return 0;
2896 }
2897
2898 int try_to_free_buffers(struct page *page)
2899 {
2900 #if 0   // mask by Victor Yu. 02-12-2007
2901         struct address_space * const mapping = page->mapping;
2902 #else
2903         struct address_space * const mapping = page->u.xx.mapping;
2904 #endif
2905         struct buffer_head *buffers_to_free = NULL;
2906         int ret = 0;
2907
2908         BUG_ON(!PageLocked(page));
2909         if (PageWriteback(page))
2910                 return 0;
2911
2912         if (mapping == NULL) {          /* can this still happen? */
2913                 ret = drop_buffers(page, &buffers_to_free);
2914                 goto out;
2915         }
2916
2917         spin_lock(&mapping->private_lock);
2918         ret = drop_buffers(page, &buffers_to_free);
2919         spin_unlock(&mapping->private_lock);
2920         if (ret) {
2921                 /*
2922                  * If the filesystem writes its buffers by hand (eg ext3)
2923                  * then we can have clean buffers against a dirty page.  We
2924                  * clean the page here; otherwise later reattachment of buffers
2925                  * could encounter a non-uptodate page, which is unresolvable.
2926                  * This only applies in the rare case where try_to_free_buffers
2927                  * succeeds but the page is not freed.
2928                  */
2929                 clear_page_dirty(page);
2930         }
2931 out:
2932         if (buffers_to_free) {
2933                 struct buffer_head *bh = buffers_to_free;
2934
2935                 do {
2936                         struct buffer_head *next = bh->b_this_page;
2937                         free_buffer_head(bh);
2938                         bh = next;
2939                 } while (bh != buffers_to_free);
2940         }
2941         return ret;
2942 }
2943 EXPORT_SYMBOL(try_to_free_buffers);
2944
2945 void block_sync_page(struct page *page)
2946 {
2947         struct address_space *mapping;
2948
2949         smp_mb();
2950         mapping = page_mapping(page);
2951         if (mapping)
2952                 blk_run_backing_dev(mapping->backing_dev_info, page);
2953 }
2954
2955 /*
2956  * There are no bdflush tunables left.  But distributions are
2957  * still running obsolete flush daemons, so we terminate them here.
2958  *
2959  * Use of bdflush() is deprecated and will be removed in a future kernel.
2960  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2961  */
2962 asmlinkage long sys_bdflush(int func, long data)
2963 {
2964         static int msg_count;
2965
2966         if (!capable(CAP_SYS_ADMIN))
2967                 return -EPERM;
2968
2969         if (msg_count < 5) {
2970                 msg_count++;
2971                 printk(KERN_INFO
2972                         "warning: process `%s' used the obsolete bdflush"
2973                         " system call\n", current->comm);
2974                 printk(KERN_INFO "Fix your initscripts?\n");
2975         }
2976
2977         if (func == 1)
2978                 do_exit(0);
2979         return 0;
2980 }
2981
/*
 * Buffer-head allocation
 *
 * Slab cache from which every struct buffer_head is allocated; created
 * in buffer_init().
 */
static kmem_cache_t *bh_cachep;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.  Computed once in buffer_init().
 */
static int max_buffer_heads;

/*
 * Non-zero while the summed per-CPU count of live buffer heads exceeds
 * max_buffer_heads.  Refreshed by recalc_bh_state().
 */
int buffer_heads_over_limit;

/*
 * Per-CPU bookkeeping for buffer heads.  'nr' is adjusted on every
 * allocation and free; 'ratelimit' counts calls to recalc_bh_state() so
 * that the sum over all CPUs is only recomputed after about 4096 calls.
 */
struct bh_accounting {
        int nr;                 /* Number of live bh's */
        int ratelimit;          /* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3001
3002 static void recalc_bh_state(void)
3003 {
3004         int i;
3005         int tot = 0;
3006
3007         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3008                 return;
3009         __get_cpu_var(bh_accounting).ratelimit = 0;
3010         for_each_online_cpu(i)
3011                 tot += per_cpu(bh_accounting, i).nr;
3012         buffer_heads_over_limit = (tot > max_buffer_heads);
3013 }
3014
3015 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3016 {
3017         struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3018         if (ret) {
3019                 get_cpu_var(bh_accounting).nr++;
3020                 recalc_bh_state();
3021                 put_cpu_var(bh_accounting);
3022         }
3023         return ret;
3024 }
3025 EXPORT_SYMBOL(alloc_buffer_head);
3026
3027 void free_buffer_head(struct buffer_head *bh)
3028 {
3029         BUG_ON(!list_empty(&bh->b_assoc_buffers));
3030         kmem_cache_free(bh_cachep, bh);
3031         get_cpu_var(bh_accounting).nr--;
3032         recalc_bh_state();
3033         put_cpu_var(bh_accounting);
3034 }
3035 EXPORT_SYMBOL(free_buffer_head);
3036
3037 static void
3038 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3039 {
3040         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3041                             SLAB_CTOR_CONSTRUCTOR) {
3042                 struct buffer_head * bh = (struct buffer_head *)data;
3043
3044                 memset(bh, 0, sizeof(*bh));
3045                 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3046         }
3047 }
3048
3049 #ifdef CONFIG_HOTPLUG_CPU
3050 static void buffer_exit_cpu(int cpu)
3051 {
3052         int i;
3053         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3054
3055         for (i = 0; i < BH_LRU_SIZE; i++) {
3056                 brelse(b->bhs[i]);
3057                 b->bhs[i] = NULL;
3058         }
3059         get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3060         per_cpu(bh_accounting, cpu).nr = 0;
3061         put_cpu_var(bh_accounting);
3062 }
3063
3064 static int buffer_cpu_notify(struct notifier_block *self,
3065                               unsigned long action, void *hcpu)
3066 {
3067         if (action == CPU_DEAD)
3068                 buffer_exit_cpu((unsigned long)hcpu);
3069         return NOTIFY_OK;
3070 }
3071 #endif /* CONFIG_HOTPLUG_CPU */
3072
3073 void __init buffer_init(void)
3074 {
3075         int nrpages;
3076
3077         bh_cachep = kmem_cache_create("buffer_head",
3078                                         sizeof(struct buffer_head), 0,
3079                                         (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3080                                         SLAB_MEM_SPREAD),
3081                                         init_buffer_head,
3082                                         NULL);
3083
3084         /*
3085          * Limit the bh occupancy to 10% of ZONE_NORMAL
3086          */
3087         nrpages = (nr_free_buffer_pages() * 10) / 100;
3088         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3089         hotcpu_notifier(buffer_cpu_notify, 0);
3090 }
3091
/*
 * Symbols exported with EXPORT_SYMBOL() in one batch here rather than
 * next to their definitions.  The list is kept in alphabetical order.
 */
EXPORT_SYMBOL(__bforget);
EXPORT_SYMBOL(__brelse);
EXPORT_SYMBOL(__wait_on_buffer);
EXPORT_SYMBOL(block_commit_write);
EXPORT_SYMBOL(block_prepare_write);
EXPORT_SYMBOL(block_read_full_page);
EXPORT_SYMBOL(block_sync_page);
EXPORT_SYMBOL(block_truncate_page);
EXPORT_SYMBOL(block_write_full_page);
EXPORT_SYMBOL(cont_prepare_write);
EXPORT_SYMBOL(end_buffer_read_sync);
EXPORT_SYMBOL(end_buffer_write_sync);
EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(fsync_bdev);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_commit_write);
EXPORT_SYMBOL(generic_cont_expand);
EXPORT_SYMBOL(generic_cont_expand_simple);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(invalidate_bdev);
EXPORT_SYMBOL(ll_rw_block);
EXPORT_SYMBOL(mark_buffer_dirty);
EXPORT_SYMBOL(submit_bh);
EXPORT_SYMBOL(sync_dirty_buffer);
EXPORT_SYMBOL(unlock_buffer);