release/src-rt/linux/linux-2.6/fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5  */
   6
   7 /*
   8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9  *
  10  * Removed a lot of unnecessary code and simplified things now that
  11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12  *
  13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15  *
   16  * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
  17  *
  18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19  */
  20
  21 #include <linux/kernel.h>
  22 #include <linux/syscalls.h>
  23 #include <linux/fs.h>
  24 #include <linux/mm.h>
  25 #include <linux/percpu.h>
  26 #include <linux/slab.h>
  27 #include <linux/capability.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/file.h>
  30 #include <linux/quotaops.h>
  31 #include <linux/highmem.h>
  32 #include <linux/module.h>
  33 #include <linux/writeback.h>
  34 #include <linux/hash.h>
  35 #include <linux/suspend.h>
  36 #include <linux/buffer_head.h>
  37 #include <linux/task_io_accounting_ops.h>
  38 #include <linux/bio.h>
  39 #include <linux/notifier.h>
  40 #include <linux/cpu.h>
  41 #include <linux/bitops.h>
  42 #include <linux/mpage.h>
  43 #include <linux/bit_spinlock.h>
  44
  45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  46
  47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  48
  49 inline void
  50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  51 {
  52         bh->b_end_io = handler;
  53         bh->b_private = private;
  54 }
  55
  56 static int sync_buffer(void *word)
  57 {
  58         struct block_device *bd;
  59         struct buffer_head *bh
  60                 = container_of(word, struct buffer_head, b_state);
  61
  62         smp_mb();
  63         bd = bh->b_bdev;
  64         if (bd)
  65                 blk_run_address_space(bd->bd_inode->i_mapping);
  66         io_schedule();
  67         return 0;
  68 }
  69
  70 void fastcall __lock_buffer(struct buffer_head *bh)
  71 {
  72         wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
  73                                                         TASK_UNINTERRUPTIBLE);
  74 }
  75 EXPORT_SYMBOL(__lock_buffer);
  76
  77 void fastcall unlock_buffer(struct buffer_head *bh)
  78 {
  79         smp_mb__before_clear_bit();
  80         clear_buffer_locked(bh);
  81         smp_mb__after_clear_bit();
  82         wake_up_bit(&bh->b_state, BH_Lock);
  83 }
  84
  85 /*
  86  * Block until a buffer comes unlocked.  This doesn't stop it
  87  * from becoming locked again - you have to lock it yourself
  88  * if you want to preserve its state.
  89  */
  90 void __wait_on_buffer(struct buffer_head * bh)
  91 {
  92         wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
  93 }
  94
  95 static void
  96 __clear_page_buffers(struct page *page)
  97 {
  98         ClearPagePrivate(page);
  99         set_page_private(page, 0);
 100         page_cache_release(page);
 101 }
 102
 103 static void buffer_io_error(struct buffer_head *bh)
 104 {
 105         char b[BDEVNAME_SIZE];
 106
 107         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 108                         bdevname(bh->b_bdev, b),
 109                         (unsigned long long)bh->b_blocknr);
 110 }
 111
 112 /*
 113  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 114  * unlock the buffer. This is what ll_rw_block uses too.
 115  */
 116 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 117 {
 118         if (uptodate) {
 119                 set_buffer_uptodate(bh);
 120         } else {
 121                 /* This happens, due to failed READA attempts. */
 122                 clear_buffer_uptodate(bh);
 123         }
 124         unlock_buffer(bh);
 125         put_bh(bh);
 126 }
 127
 128 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 129 {
 130         char b[BDEVNAME_SIZE];
 131
 132         if (uptodate) {
 133                 set_buffer_uptodate(bh);
 134         } else {
 135                 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
 136                         buffer_io_error(bh);
 137                         printk(KERN_WARNING "lost page write due to "
 138                                         "I/O error on %s\n",
 139                                        bdevname(bh->b_bdev, b));
 140                 }
 141                 set_buffer_write_io_error(bh);
 142                 clear_buffer_uptodate(bh);
 143         }
 144         unlock_buffer(bh);
 145         put_bh(bh);
 146 }
 147
 148 /*
 149  * Write out and wait upon all the dirty data associated with a block
 150  * device via its mapping.  Does not take the superblock lock.
 151  */
 152 int sync_blockdev(struct block_device *bdev)
 153 {
 154         int ret = 0;
 155
 156         if (bdev)
 157                 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
 158         return ret;
 159 }
 160 EXPORT_SYMBOL(sync_blockdev);
 161
 162 /*
 163  * Write out and wait upon all dirty data associated with this
 164  * device.   Filesystem data as well as the underlying block
 165  * device.  Takes the superblock lock.
 166  */
 167 int fsync_bdev(struct block_device *bdev)
 168 {
 169         struct super_block *sb = get_super(bdev);
 170         if (sb) {
 171                 int res = fsync_super(sb);
 172                 drop_super(sb);
 173                 return res;
 174         }
 175         return sync_blockdev(bdev);
 176 }
 177
 178 /**
 179  * freeze_bdev  --  lock a filesystem and force it into a consistent state
 180  * @bdev:       blockdevice to lock
 181  *
 182  * This takes the block device bd_mount_sem to make sure no new mounts
 183  * happen on bdev until thaw_bdev() is called.
 184  * If a superblock is found on this device, we take the s_umount semaphore
 185  * on it to make sure nobody unmounts until the snapshot creation is done.
 186  */
 187 struct super_block *freeze_bdev(struct block_device *bdev)
 188 {
 189         struct super_block *sb;
 190
 191         down(&bdev->bd_mount_sem);
 192         sb = get_super(bdev);
 193         if (sb && !(sb->s_flags & MS_RDONLY)) {
 194                 sb->s_frozen = SB_FREEZE_WRITE;
 195                 smp_wmb();
 196
 197                 __fsync_super(sb);
 198
 199                 sb->s_frozen = SB_FREEZE_TRANS;
 200                 smp_wmb();
 201
 202                 sync_blockdev(sb->s_bdev);
 203
 204                 if (sb->s_op->write_super_lockfs)
 205                         sb->s_op->write_super_lockfs(sb);
 206         }
 207
 208         sync_blockdev(bdev);
 209         return sb;      /* thaw_bdev releases s->s_umount and bd_mount_sem */
 210 }
 211 EXPORT_SYMBOL(freeze_bdev);
 212
 213 /**
 214  * thaw_bdev  -- unlock filesystem
 215  * @bdev:       blockdevice to unlock
 216  * @sb:         associated superblock
 217  *
 218  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 219  */
 220 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
 221 {
 222         if (sb) {
 223                 BUG_ON(sb->s_bdev != bdev);
 224
 225                 if (sb->s_op->unlockfs)
 226                         sb->s_op->unlockfs(sb);
 227                 sb->s_frozen = SB_UNFROZEN;
 228                 smp_wmb();
 229                 wake_up(&sb->s_wait_unfrozen);
 230                 drop_super(sb);
 231         }
 232
 233         up(&bdev->bd_mount_sem);
 234 }
 235 EXPORT_SYMBOL(thaw_bdev);
 236
 237 /*
 238  * Various filesystems appear to want __find_get_block to be non-blocking.
 239  * But it's the page lock which protects the buffers.  To get around this,
 240  * we get exclusion from try_to_free_buffers with the blockdev mapping's
 241  * private_lock.
 242  *
 243  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 244  * may be quite high.  This code could TryLock the page, and if that
 245  * succeeds, there is no need to take private_lock. (But if
 246  * private_lock is contended then so is mapping->tree_lock).
 247  */
 248 static struct buffer_head *
 249 __find_get_block_slow(struct block_device *bdev, sector_t block)
 250 {
 251         struct inode *bd_inode = bdev->bd_inode;
 252         struct address_space *bd_mapping = bd_inode->i_mapping;
 253         struct buffer_head *ret = NULL;
 254         pgoff_t index;
 255         struct buffer_head *bh;
 256         struct buffer_head *head;
 257         struct page *page;
 258         int all_mapped = 1;
 259
 260         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 261         page = find_get_page(bd_mapping, index);
 262         if (!page)
 263                 goto out;
 264
 265         spin_lock(&bd_mapping->private_lock);
 266         if (!page_has_buffers(page))
 267                 goto out_unlock;
 268         head = page_buffers(page);
 269         bh = head;
 270         do {
 271                 if (bh->b_blocknr == block) {
 272                         ret = bh;
 273                         get_bh(bh);
 274                         goto out_unlock;
 275                 }
 276                 if (!buffer_mapped(bh))
 277                         all_mapped = 0;
 278                 bh = bh->b_this_page;
 279         } while (bh != head);
 280
 281         /* we might be here because some of the buffers on this page are
 282          * not mapped.  This is due to various races between
 283          * file io on the block device and getblk.  It gets dealt with
 284          * elsewhere, don't buffer_error if we had some unmapped buffers
 285          */
 286         if (all_mapped) {
 287                 printk("__find_get_block_slow() failed. "
 288                         "block=%llu, b_blocknr=%llu\n",
 289                         (unsigned long long)block,
 290                         (unsigned long long)bh->b_blocknr);
 291                 printk("b_state=0x%08lx, b_size=%zu\n",
 292                         bh->b_state, bh->b_size);
 293                 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
 294         }
 295 out_unlock:
 296         spin_unlock(&bd_mapping->private_lock);
 297         page_cache_release(page);
 298 out:
 299         return ret;
 300 }
 301
 302 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 303    of fs corruption is going on. Trashing dirty data always imply losing
 304    information that was supposed to be just stored on the physical layer
 305    by the user.
 306
 307    Thus invalidate_buffers in general usage is not allwowed to trash
 308    dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
 309    be preserved.  These buffers are simply skipped.
 310
 311    We also skip buffers which are still in use.  For example this can
 312    happen if a userspace program is reading the block device.
 313
 314    NOTE: In the case where the user removed a removable-media-disk even if
 315    there's still dirty data not synced on disk (due a bug in the device driver
 316    or due an error of the user), by not destroying the dirty buffers we could
 317    generate corruption also on the next media inserted, thus a parameter is
 318    necessary to handle this case in the most safe way possible (trying
 319    to not corrupt also the new disk inserted with the data belonging to
 320    the old now corrupted disk). Also for the ramdisk the natural thing
 321    to do in order to release the ramdisk memory is to destroy dirty buffers.
 322
 323    These are two special cases. Normal usage imply the device driver
 324    to issue a sync on the device (without waiting I/O completion) and
 325    then an invalidate_buffers call that doesn't trash dirty buffers.
 326
 327    For handling cache coherency with the blkdev pagecache the 'update' case
 328    is been introduced. It is needed to re-read from disk any pinned
 329    buffer. NOTE: re-reading from disk is destructive so we can do it only
 330    when we assume nobody is changing the buffercache under our I/O and when
 331    we think the disk contains more recent information than the buffercache.
 332    The update == 1 pass marks the buffers we need to update, the update == 2
 333    pass does the actual I/O. */
 334 void invalidate_bdev(struct block_device *bdev)
 335 {
 336         struct address_space *mapping = bdev->bd_inode->i_mapping;
 337
 338         if (mapping->nrpages == 0)
 339                 return;
 340
 341         invalidate_bh_lrus();
 342         invalidate_mapping_pages(mapping, 0, -1);
 343 }
 344
 345 /*
 346  * Kick pdflush then try to free up some ZONE_NORMAL memory.
 347  */
 348 static void free_more_memory(void)
 349 {
 350         struct zone **zones;
 351         pg_data_t *pgdat;
 352
 353         wakeup_pdflush(1024);
 354         yield();
 355
 356         for_each_online_pgdat(pgdat) {
 357                 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
 358                 if (*zones)
 359                         try_to_free_pages(zones, GFP_NOFS);
 360         }
 361 }
 362
 363 /*
 364  * I/O completion handler for block_read_full_page() - pages
 365  * which come unlocked at the end of I/O.
 366  */
 367 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 368 {
 369         unsigned long flags;
 370         struct buffer_head *first;
 371         struct buffer_head *tmp;
 372         struct page *page;
 373         int page_uptodate = 1;
 374
 375         BUG_ON(!buffer_async_read(bh));
 376
 377         page = bh->b_page;
 378         if (uptodate) {
 379                 set_buffer_uptodate(bh);
 380         } else {
 381                 clear_buffer_uptodate(bh);
 382                 if (printk_ratelimit())
 383                         buffer_io_error(bh);
 384                 SetPageError(page);
 385         }
 386
 387         /*
 388          * Be _very_ careful from here on. Bad things can happen if
 389          * two buffer heads end IO at almost the same time and both
 390          * decide that the page is now completely done.
 391          */
 392         first = page_buffers(page);
 393         local_irq_save(flags);
 394         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 395         clear_buffer_async_read(bh);
 396         unlock_buffer(bh);
 397         tmp = bh;
 398         do {
 399                 if (!buffer_uptodate(tmp))
 400                         page_uptodate = 0;
 401                 if (buffer_async_read(tmp)) {
 402                         BUG_ON(!buffer_locked(tmp));
 403                         goto still_busy;
 404                 }
 405                 tmp = tmp->b_this_page;
 406         } while (tmp != bh);
 407         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 408         local_irq_restore(flags);
 409
 410         /*
 411          * If none of the buffers had errors and they are all
 412          * uptodate then we can set the page uptodate.
 413          */
 414         if (page_uptodate && !PageError(page))
 415                 SetPageUptodate(page);
 416         unlock_page(page);
 417         return;
 418
 419 still_busy:
 420         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 421         local_irq_restore(flags);
 422         return;
 423 }
 424
 425 /*
 426  * Completion handler for block_write_full_page() - pages which are unlocked
 427  * during I/O, and which have PageWriteback cleared upon I/O completion.
 428  */
 429 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 430 {
 431         char b[BDEVNAME_SIZE];
 432         unsigned long flags;
 433         struct buffer_head *first;
 434         struct buffer_head *tmp;
 435         struct page *page;
 436
 437         BUG_ON(!buffer_async_write(bh));
 438
 439         page = bh->b_page;
 440         if (uptodate) {
 441                 set_buffer_uptodate(bh);
 442         } else {
 443                 if (printk_ratelimit()) {
 444                         buffer_io_error(bh);
 445                         printk(KERN_WARNING "lost page write due to "
 446                                         "I/O error on %s\n",
 447                                bdevname(bh->b_bdev, b));
 448                 }
 449                 set_bit(AS_EIO, &page->mapping->flags);
 450                 set_buffer_write_io_error(bh);
 451                 clear_buffer_uptodate(bh);
 452                 SetPageError(page);
 453         }
 454
 455         first = page_buffers(page);
 456         local_irq_save(flags);
 457         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 458
 459         clear_buffer_async_write(bh);
 460         unlock_buffer(bh);
 461         tmp = bh->b_this_page;
 462         while (tmp != bh) {
 463                 if (buffer_async_write(tmp)) {
 464                         BUG_ON(!buffer_locked(tmp));
 465                         goto still_busy;
 466                 }
 467                 tmp = tmp->b_this_page;
 468         }
 469         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 470         local_irq_restore(flags);
 471         end_page_writeback(page);
 472         return;
 473
 474 still_busy:
 475         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
 476         local_irq_restore(flags);
 477         return;
 478 }
 479
 480 /*
 481  * If a page's buffers are under async readin (end_buffer_async_read
 482  * completion) then there is a possibility that another thread of
 483  * control could lock one of the buffers after it has completed
 484  * but while some of the other buffers have not completed.  This
 485  * locked buffer would confuse end_buffer_async_read() into not unlocking
 486  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 487  * that this buffer is not under async I/O.
 488  *
 489  * The page comes unlocked when it has no locked buffer_async buffers
 490  * left.
 491  *
 492  * PageLocked prevents anyone starting new async I/O reads any of
 493  * the buffers.
 494  *
 495  * PageWriteback is used to prevent simultaneous writeout of the same
 496  * page.
 497  *
 498  * PageLocked prevents anyone from starting writeback of a page which is
 499  * under read I/O (PageWriteback is only ever set against a locked page).
 500  */
 501 static void mark_buffer_async_read(struct buffer_head *bh)
 502 {
 503         bh->b_end_io = end_buffer_async_read;
 504         set_buffer_async_read(bh);
 505 }
 506
 507 void mark_buffer_async_write(struct buffer_head *bh)
 508 {
 509         bh->b_end_io = end_buffer_async_write;
 510         set_buffer_async_write(bh);
 511 }
 512 EXPORT_SYMBOL(mark_buffer_async_write);
 513
 514
 515 /*
 516  * fs/buffer.c contains helper functions for buffer-backed address space's
 517  * fsync functions.  A common requirement for buffer-based filesystems is
 518  * that certain data from the backing blockdev needs to be written out for
 519  * a successful fsync().  For example, ext2 indirect blocks need to be
 520  * written back and waited upon before fsync() returns.
 521  *
  522  * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
 523  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 524  * management of a list of dependent buffers at ->i_mapping->private_list.
 525  *
 526  * Locking is a little subtle: try_to_free_buffers() will remove buffers
 527  * from their controlling inode's queue when they are being freed.  But
 528  * try_to_free_buffers() will be operating against the *blockdev* mapping
 529  * at the time, not against the S_ISREG file which depends on those buffers.
 530  * So the locking for private_list is via the private_lock in the address_space
 531  * which backs the buffers.  Which is different from the address_space
 532  * against which the buffers are listed.  So for a particular address_space,
 533  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 534  * mapping->private_list will always be protected by the backing blockdev's
 535  * ->private_lock.
 536  *
 537  * Which introduces a requirement: all buffers on an address_space's
 538  * ->private_list must be from the same address_space: the blockdev's.
 539  *
 540  * address_spaces which do not place buffers at ->private_list via these
 541  * utility functions are free to use private_lock and private_list for
 542  * whatever they want.  The only requirement is that list_empty(private_list)
 543  * be true at clear_inode() time.
 544  *
 545  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 546  * filesystems should do that.  invalidate_inode_buffers() should just go
 547  * BUG_ON(!list_empty).
 548  *
 549  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 550  * take an address_space, not an inode.  And it should be called
 551  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 552  * queued up.
 553  *
 554  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 555  * list if it is already on a list.  Because if the buffer is on a list,
 556  * it *must* already be on the right one.  If not, the filesystem is being
 557  * silly.  This will save a ton of locking.  But first we have to ensure
 558  * that buffers are taken *off* the old inode's list when they are freed
 559  * (presumably in truncate).  That requires careful auditing of all
 560  * filesystems (do it inside bforget()).  It could also be done by bringing
 561  * b_inode back.
 562  */
 563
 564 /*
 565  * The buffer's backing address_space's private_lock must be held
 566  */
 567 static inline void __remove_assoc_queue(struct buffer_head *bh)
 568 {
 569         list_del_init(&bh->b_assoc_buffers);
 570         WARN_ON(!bh->b_assoc_map);
 571         if (buffer_write_io_error(bh))
 572                 set_bit(AS_EIO, &bh->b_assoc_map->flags);
 573         bh->b_assoc_map = NULL;
 574 }
 575
 576 int inode_has_buffers(struct inode *inode)
 577 {
 578         return !list_empty(&inode->i_data.private_list);
 579 }
 580
 581 /*
 582  * osync is designed to support O_SYNC io.  It waits synchronously for
 583  * all already-submitted IO to complete, but does not queue any new
 584  * writes to the disk.
 585  *
 586  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 587  * you dirty the buffers, and then use osync_inode_buffers to wait for
 588  * completion.  Any other dirty buffers which are not yet queued for
 589  * write will not be flushed to disk by the osync.
 590  */
 591 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 592 {
 593         struct buffer_head *bh;
 594         struct list_head *p;
 595         int err = 0;
 596
 597         spin_lock(lock);
 598 repeat:
 599         list_for_each_prev(p, list) {
 600                 bh = BH_ENTRY(p);
 601                 if (buffer_locked(bh)) {
 602                         get_bh(bh);
 603                         spin_unlock(lock);
 604                         wait_on_buffer(bh);
 605                         if (!buffer_uptodate(bh))
 606                                 err = -EIO;
 607                         brelse(bh);
 608                         spin_lock(lock);
 609                         goto repeat;
 610                 }
 611         }
 612         spin_unlock(lock);
 613         return err;
 614 }
 615
 616 /**
 617  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 618  *                        buffers
 619  * @mapping: the mapping which wants those buffers written
 620  *
 621  * Starts I/O against the buffers at mapping->private_list, and waits upon
 622  * that I/O.
 623  *
 624  * Basically, this is a convenience function for fsync().
 625  * @mapping is a file or directory which needs those buffers to be written for
 626  * a successful fsync().
 627  */
 628 int sync_mapping_buffers(struct address_space *mapping)
 629 {
 630         struct address_space *buffer_mapping = mapping->assoc_mapping;
 631
 632         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 633                 return 0;
 634
 635         return fsync_buffers_list(&buffer_mapping->private_lock,
 636                                         &mapping->private_list);
 637 }
 638 EXPORT_SYMBOL(sync_mapping_buffers);
 639
 640 /*
 641  * Called when we've recently written block `bblock', and it is known that
 642  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 643  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 644  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 645  */
 646 void write_boundary_block(struct block_device *bdev,
 647                         sector_t bblock, unsigned blocksize)
 648 {
 649         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 650         if (bh) {
 651                 if (buffer_dirty(bh))
 652                         ll_rw_block(WRITE, 1, &bh);
 653                 put_bh(bh);
 654         }
 655 }
 656
 657 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 658 {
 659         struct address_space *mapping = inode->i_mapping;
 660         struct address_space *buffer_mapping = bh->b_page->mapping;
 661
 662         mark_buffer_dirty(bh);
 663         if (!mapping->assoc_mapping) {
 664                 mapping->assoc_mapping = buffer_mapping;
 665         } else {
 666                 BUG_ON(mapping->assoc_mapping != buffer_mapping);
 667         }
 668         if (list_empty(&bh->b_assoc_buffers)) {
 669                 spin_lock(&buffer_mapping->private_lock);
 670                 list_move_tail(&bh->b_assoc_buffers,
 671                                 &mapping->private_list);
 672                 bh->b_assoc_map = mapping;
 673                 spin_unlock(&buffer_mapping->private_lock);
 674         }
 675 }
 676 EXPORT_SYMBOL(mark_buffer_dirty_inode);
 677
 678 /*
 679  * Add a page to the dirty page list.
 680  *
 681  * It is a sad fact of life that this function is called from several places
 682  * deeply under spinlocking.  It may not sleep.
 683  *
 684  * If the page has buffers, the uptodate buffers are set dirty, to preserve
 685  * dirty-state coherency between the page and the buffers.  It the page does
 686  * not have buffers then when they are later attached they will all be set
 687  * dirty.
 688  *
 689  * The buffers are dirtied before the page is dirtied.  There's a small race
 690  * window in which a writepage caller may see the page cleanness but not the
 691  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 692  * before the buffers, a concurrent writepage caller could clear the page dirty
 693  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 694  * page on the dirty page list.
 695  *
 696  * We use private_lock to lock against try_to_free_buffers while using the
 697  * page's buffer list.  Also use this to protect against clean buffers being
 698  * added to the page after it was set dirty.
 699  *
 700  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 701  * address_space though.
 702  */
 703 int __set_page_dirty_buffers(struct page *page)
 704 {
 705         struct address_space * const mapping = page_mapping(page);
 706
 707         if (unlikely(!mapping))
 708                 return !TestSetPageDirty(page);
 709
 710         spin_lock(&mapping->private_lock);
 711         if (page_has_buffers(page)) {
 712                 struct buffer_head *head = page_buffers(page);
 713                 struct buffer_head *bh = head;
 714
 715                 do {
 716                         set_buffer_dirty(bh);
 717                         bh = bh->b_this_page;
 718                 } while (bh != head);
 719         }
 720         spin_unlock(&mapping->private_lock);
 721
 722         if (TestSetPageDirty(page))
 723                 return 0;
 724
 725         write_lock_irq(&mapping->tree_lock);
 726         if (page->mapping) {    /* Race with truncate? */
 727                 if (mapping_cap_account_dirty(mapping)) {
 728                         __inc_zone_page_state(page, NR_FILE_DIRTY);
 729                         __inc_bdi_stat(mapping->backing_dev_info,
 730                                         BDI_RECLAIMABLE);
 731                         task_io_account_write(PAGE_CACHE_SIZE);
 732                 }
 733                 radix_tree_tag_set(&mapping->page_tree,
 734                                 page_index(page), PAGECACHE_TAG_DIRTY);
 735         }
 736         write_unlock_irq(&mapping->tree_lock);
 737         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 738         return 1;
 739 }
 740 EXPORT_SYMBOL(__set_page_dirty_buffers);
 741
 742 /*
 743  * Write out and wait upon a list of buffers.
 744  *
 745  * We have conflicting pressures: we want to make sure that all
 746  * initially dirty buffers get waited on, but that any subsequently
 747  * dirtied buffers don't.  After all, we don't want fsync to last
 748  * forever if somebody is actively writing to the file.
 749  *
 750  * Do this in two main stages: first we copy dirty buffers to a
 751  * temporary inode list, queueing the writes as we go.  Then we clean
 752  * up, waiting for those writes to complete.
 753  *
 754  * During this second stage, any subsequent updates to the file may end
 755  * up refiling the buffer on the original inode's dirty list again, so
 756  * there is a chance we will end up with a buffer queued for write but
 757  * not yet completed on that list.  So, as a final cleanup we go through
 758  * the osync code to catch these locked, dirty buffers without requeuing
 759  * any newly dirty buffers for write.
 760  */
 761 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 762 {
 763         struct buffer_head *bh;
 764         struct list_head tmp;
 765         int err = 0, err2;
 766
 767         INIT_LIST_HEAD(&tmp);
 768
             /*
              * Pass 1: empty @list.  Buffers that are dirty or locked are
              * parked on the private tmp list; the others are just taken
              * off the association queue.
              */
 769         spin_lock(lock);
 770         while (!list_empty(list)) {
 771                 bh = BH_ENTRY(list->next);
 772                 __remove_assoc_queue(bh);
 773                 if (buffer_dirty(bh) || buffer_locked(bh)) {
 774                         list_add(&bh->b_assoc_buffers, &tmp);
 775                         if (buffer_dirty(bh)) {
                                     /*
                                      * Hold a reference across the window
                                      * in which @lock is dropped to submit
                                      * the write.
                                      */
 776                                 get_bh(bh);
 777                                 spin_unlock(lock);
 778                                 /*
 779                                  * Ensure any pending I/O completes so that
 780                                  * ll_rw_block() actually writes the current
 781                                  * contents - it is a noop if I/O is still in
 782                                  * flight on potentially older contents.
 783                                  */
 784                                 ll_rw_block(SWRITE, 1, &bh);
 785                                 brelse(bh);
 786                                 spin_lock(lock);
 787                         }
 788                 }
 789         }
 790
             /*
              * Pass 2: wait for everything parked on tmp.  list_add() pushed
              * each buffer at the head and tmp.prev pops from the tail, so
              * buffers are waited on in the order they were parked.  @lock is
              * dropped around each wait, with a reference held meanwhile.
              * A buffer that is not uptodate after the wait yields -EIO.
              */
 791         while (!list_empty(&tmp)) {
 792                 bh = BH_ENTRY(tmp.prev);
 793                 list_del_init(&bh->b_assoc_buffers);
 794                 get_bh(bh);
 795                 spin_unlock(lock);
 796                 wait_on_buffer(bh);
 797                 if (!buffer_uptodate(bh))
 798                         err = -EIO;
 799                 brelse(bh);
 800                 spin_lock(lock);
 801         }
 802
 803         spin_unlock(lock);
             /*
              * Final sweep over @list; osync_buffers_list() is handed @lock
              * only after it has been released here.  An error recorded by
              * pass 2 takes precedence over the sweep's result.
              */
 804         err2 = osync_buffers_list(lock, list);
 805         if (err)
 806                 return err;
 807         else
 808                 return err2;
 809 }
 810
 811 /*
 812  * Invalidate any and all dirty buffers on a given inode.  We are
 813  * probably unmounting the fs, but that doesn't mean we have already
 814  * done a sync().  Just drop the buffers from the inode list.
 815  *
 816  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 817  * assumes that all the buffers are against the blockdev.  Not true
 818  * for reiserfs.
 819  */
 820 void invalidate_inode_buffers(struct inode *inode)
 821 {
 822         struct address_space *owner = &inode->i_data, *locker;
 823
 824         if (!inode_has_buffers(inode))
 825                 return;
 826
             /*
              * The association list hangs off the inode's own mapping, but
              * the lock taken around it is the private_lock of that
              * mapping's assoc_mapping.
              */
 827         locker = owner->assoc_mapping;
 828         spin_lock(&locker->private_lock);
 829         while (!list_empty(&owner->private_list))
 830                 __remove_assoc_queue(BH_ENTRY(owner->private_list.next));
 831         spin_unlock(&locker->private_lock);
 832 }
 833
 834 /*
 835  * Remove any clean buffers from the inode's buffer list.  This is called
 836  * when we're trying to free the inode itself.  Those buffers can pin it.
 837  *
 838  * Returns true if all buffers were removed.
 839  */
 840 int remove_inode_buffers(struct inode *inode)
 841 {
 842         struct address_space *owner = &inode->i_data;
 843         struct list_head *queue = &owner->private_list;
 844         struct address_space *locker;
 845         int all_removed = 1;
 846
             /* With nothing queued the answer is trivially "all removed". */
 847         if (!inode_has_buffers(inode))
 848                 return all_removed;
 849
 850         locker = owner->assoc_mapping;
 851         spin_lock(&locker->private_lock);
 852         while (all_removed && !list_empty(queue)) {
 853                 struct buffer_head *victim = BH_ENTRY(queue->next);
                     /*
                      * The first dirty buffer stops the scan: it and everything
                      * queued behind it stay on the list.
                      */
 854                 if (buffer_dirty(victim))
 855                         all_removed = 0;
 856                 else
 857                         __remove_assoc_queue(victim);
 858         }
 859         spin_unlock(&locker->private_lock);
 860         return all_removed;
 861 }
 862
 863 /*
 864  * Create the appropriate buffers when given a page for data area and
 865  * the size of each buffer.. Use the bh->b_this_page linked list to
 866  * follow the buffers created.  Return NULL if unable to create more
 867  * buffers.
 868  *
 869  * The retry flag is used to differentiate async IO (paging, swapping)
 870  * which may not fail from ordinary buffer allocations.
 871  */
 872 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
 873                 int retry)
 874 {
 875         struct buffer_head *bh, *head;
             /* Signed, so that stepping below zero terminates the loop. */
 876         long offset;
 877
 878 try_again:
 879         head = NULL;
 880         offset = PAGE_SIZE;
             /*
              * Walk the page from its end towards offset 0 in @size steps.
              * Each new buffer is pushed on the front of the chain, so the
              * head that is returned is the buffer with the lowest offset
              * and the last buffer's b_this_page is NULL (the chain is not
              * yet circular).
              */
 881         while ((offset -= size) >= 0) {
 882                 bh = alloc_buffer_head(GFP_NOFS);
 883                 if (!bh)
 884                         goto no_grow;
 885
 886                 bh->b_bdev = NULL;
 887                 bh->b_this_page = head;
 888                 bh->b_blocknr = -1;
 889                 head = bh;
 890
 891                 bh->b_state = 0;
 892                 atomic_set(&bh->b_count, 0);
 893                 bh->b_private = NULL;
 894                 bh->b_size = size;
 895
 896                 /* Link the buffer to its page */
 897                 set_bh_page(bh, page, offset);
 898
 899                 init_buffer(bh, NULL, NULL);
 900         }
 901         return head;
 902 /*
 903  * In case anything failed, we just free everything we got.
 904  */
 905 no_grow:
 906         if (head) {
 907                 do {
 908                         bh = head;
 909                         head = head->b_this_page;
 910                         free_buffer_head(bh);
 911                 } while (head);
 912         }
 913
 914         /*
 915          * Return failure for non-async IO requests.  Async IO requests
 916          * are not allowed to fail, so we have to wait until buffer heads
 917          * become available.  But we don't want tasks sleeping with
 918          * partially complete buffers, so all were released above.
 919          */
 920         if (!retry)
 921                 return NULL;
 922
 923         /* We're _really_ low on memory. Now we just
 924          * wait for old buffer heads to become free due to
 925          * finishing IO.  Since this is an async request and
 926          * the reserve list is empty, we're sure there are
 927          * async buffer heads in use.
 928          */
             /* With @retry set this function never returns NULL: it loops. */
 929         free_more_memory();
 930         goto try_again;
 931 }
 932 EXPORT_SYMBOL_GPL(alloc_page_buffers);
 933
 934 static inline void
 935 link_dev_buffers(struct page *page, struct buffer_head *head)
 936 {
 937         struct buffer_head *last = head;
 938
 939         /* Walk the NULL-terminated chain to find its final element. */
 940         while (last->b_this_page)
 941                 last = last->b_this_page;
 942
 943         /* Close the chain into a ring, then hang it off the page. */
 944         last->b_this_page = head;
 945         attach_page_buffers(page, head);
 946 }
 947
 948 /*
 949  * Initialise the state of a blockdev page's buffers.
 950  */
 951 static void
 952 init_page_buffers(struct page *page, struct block_device *bdev,
 953                         sector_t block, int size)
 954 {
 955         struct buffer_head *const first = page_buffers(page);
 956         struct buffer_head *cur = first;
 957         const int page_is_uptodate = PageUptodate(page);
 958
 959         /* One lap round the ring; 'block' advances with every buffer. */
 960         do {
 961                 if (buffer_mapped(cur))
 962                         continue;       /* already set up: leave as is */
 963
 964                 init_buffer(cur, NULL, NULL);
 965                 cur->b_bdev = bdev;
 966                 cur->b_blocknr = block;
 967                 if (page_is_uptodate)
 968                         set_buffer_uptodate(cur);
 969                 set_buffer_mapped(cur);
 970         } while (block++, (cur = cur->b_this_page) != first);
 971 }
 972
 973 /*
 974  * Create the page-cache page that contains the requested block.
 975  *
 976  * This is used purely for blockdev mappings.
 977  */
 978 static struct page *
 979 grow_dev_page(struct block_device *bdev, sector_t block,
 980                 pgoff_t index, int size)
 981 {
 982         struct inode *inode = bdev->bd_inode;
 983         struct page *page;
 984         struct buffer_head *bh;
 985
             /*
              * Look up or create the page in the blockdev's mapping, using
              * the mapping's gfp mask with __GFP_FS removed and
              * __GFP_MOVABLE added.
              */
 986         page = find_or_create_page(inode->i_mapping, index,
 987                 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
 988         if (!page)
 989                 return NULL;
 990
 991         BUG_ON(!PageLocked(page));
 992
             /*
              * Existing buffers of the requested size are reused as they
              * are (any still-unmapped ones get initialised).  Buffers of a
              * different size have to be stripped before new ones are built.
              */
 993         if (page_has_buffers(page)) {
 994                 bh = page_buffers(page);
 995                 if (bh->b_size == size) {
 996                         init_page_buffers(page, bdev, block, size);
 997                         return page;
 998                 }
 999                 if (!try_to_free_buffers(page))
1000                         goto failed;
1001         }
1002
1003         /*
1004          * Allocate some buffers for this page
1005          */
             /* retry == 0: a failed allocation comes back as NULL. */
1006         bh = alloc_page_buffers(page, size, 0);
1007         if (!bh)
1008                 goto failed;
1009
1010         /*
1011          * Link the page to the buffers and initialise them.  Take the
1012          * lock to be atomic wrt __find_get_block(), which does not
1013          * run under the page lock.
1014          */
1015         spin_lock(&inode->i_mapping->private_lock);
1016         link_dev_buffers(page, bh);
1017         init_page_buffers(page, bdev, block, size);
1018         spin_unlock(&inode->i_mapping->private_lock);
             /* Both success paths hand the page back without unlocking it. */
1019         return page;
1020
1021 failed:
             /*
              * NOTE(review): the cleanup after BUG() can only run if BUG()
              * returns (e.g. when it is compiled out) - confirm.
              */
1022         BUG();
1023         unlock_page(page);
1024         page_cache_release(page);
1025         return NULL;
1026 }
1027
1028 /*
1029  * Create buffers for the specified block device block's page.  If
1030  * that page was dirty, the buffers are set dirty also.
1031  *
1032  * Except that's a bug.  Attaching dirty buffers to a dirty
1033  * blockdev's page can result in filesystem corruption, because
1034  * some of those buffers may be aliases of filesystem data.
1035  * grow_dev_page() will go BUG() if this happens.
1036  */
1037 static int
1038 grow_buffers(struct block_device *bdev, sector_t block, int size)
1039 {
1040         struct page *page;
1041         pgoff_t index;
1042         int sizebits;
1043
1044         /* Smallest shift for which (size << sizebits) reaches PAGE_SIZE. */
1045         for (sizebits = 0; (size << sizebits) < PAGE_SIZE; sizebits++)
1046                 ;
1047
1048         /*
1049          * index is a pgoff_t; if converting the page index to that type
1050          * changed its value, the block cannot be reached through the
1051          * pagecache (the comparison itself is done in sector_t).
1052          */
1053         index = block >> sizebits;
1054         if (unlikely(index != block >> sizebits)) {
1055                 char b[BDEVNAME_SIZE];
1056
1057                 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1058                         "device %s\n",
1059                         __FUNCTION__, (unsigned long long)block,
1060                         bdevname(bdev, b));
1061                 return -EIO;
1062         }
1063
1064         /* Build the page, starting from the first block it covers. */
1065         page = grow_dev_page(bdev, index << sizebits, index, size);
1066         if (!page)
1067                 return 0;
1068         /* Success: release the page that grow_dev_page() handed back. */
1069         unlock_page(page);
1070         page_cache_release(page);
1071         return 1;
1072 }
1073
1074 static struct buffer_head *
1075 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1076 {
1077         struct buffer_head *found;
1078         int grown;
1079
1080         /* Size must be multiple of hard sectorsize, within 512..PAGE_SIZE */
1081         if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1082                         (size < 512 || size > PAGE_SIZE))) {
1083                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1084                                         size);
1085                 printk(KERN_ERR "hardsect size: %d\n",
1086                                         bdev_hardsect_size(bdev));
1087                 dump_stack();
1088                 return NULL;
1089         }
1090
1091         /*
1092          * Lookup, and on a miss grow the page and retry; only an
1093          * out-of-range block (grow_buffers() < 0) ends without a buffer.
1094          */
1095         while (!(found = __find_get_block(bdev, block, size))) {
1096                 grown = grow_buffers(bdev, block, size);
1097                 if (grown < 0)
1098                         return NULL;
1099                 if (!grown)
1100                         free_more_memory();
1101         }
1102         return found;
1103 }
1104
1105 /*
1106  * The relationship between dirty buffers and dirty pages:
1107  *
1108  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1109  * the page is tagged dirty in its radix tree.
1110  *
1111  * At all times, the dirtiness of the buffers represents the dirtiness of
1112  * subsections of the page.  If the page has buffers, the page dirty bit is
1113  * merely a hint about the true dirty state.
1114  *
1115  * When a page is set dirty in its entirety, all its buffers are marked dirty
1116  * (if the page has buffers).
1117  *
1118  * When a buffer is marked dirty, its page is dirtied, but the page's other
1119  * buffers are not.
1120  *
1121  * Also.  When blockdev buffers are explicitly read with bread(), they
1122  * individually become uptodate.  But their backing page remains not
1123  * uptodate - even if all of its buffers are uptodate.  A subsequent
1124  * block_read_full_page() against that page will discover all the uptodate
1125  * buffers, will set the page uptodate and will perform no I/O.
1126  */
1127
1128 /**
1129  * mark_buffer_dirty - mark a buffer_head as needing writeout
1130  * @bh: the buffer_head to mark dirty
1131  *
1132  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1133  * backing page dirty, then tag the page as dirty in its address_space's radix
1134  * tree and then attach the address_space's inode to its superblock's dirty
1135  * inode list.
1136  *
1137  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1138  * mapping->tree_lock and the global inode_lock.
1139  */
1140 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1141 {
             /*
              * Plain test first, test-and-set second.  The page is dirtied
              * only when neither of them found the dirty bit already set.
              */
1142         if (!(buffer_dirty(bh) || test_set_buffer_dirty(bh)))
1143                 __set_page_dirty_nobuffers(bh->b_page);
1144 }
1145
1146 /*
1147  * Decrement a buffer_head's reference count.  If all buffers against a page
1148  * have zero reference count, are clean and unlocked, and if the page is clean
1149  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1150  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1151  * a page but it ends up not being freed, and buffers may later be reattached).
1152  */
1153 void __brelse(struct buffer_head * buf)
1154 {
1155         if (!atomic_read(&buf->b_count)) {
                     /* Count is already zero: report it, do not decrement. */
1156                 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1157                 WARN_ON(1);
1158                 return;
1159         }
1160         put_bh(buf);
1161 }
1162
1163 /*
1164  * bforget() is like brelse(), except it discards any
1165  * potentially dirty data.
1166  */
1167 void __bforget(struct buffer_head *bh)
1168 {
1169         clear_buffer_dirty(bh);
1170         if (!list_empty(&bh->b_assoc_buffers)) {
1171                 spinlock_t *assoc_lock = &bh->b_page->mapping->private_lock;
1172
                     /*
                      * Still queued on an association list: unhook it under
                      * the private_lock of the mapping that owns its page.
                      */
1173                 spin_lock(assoc_lock);
1174                 list_del_init(&bh->b_assoc_buffers);
1175                 bh->b_assoc_map = NULL;
1176                 spin_unlock(assoc_lock);
1177         }
1178         __brelse(bh);
1179 }
1180
1181 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1182 {
1183         lock_buffer(bh);
1184         if (buffer_uptodate(bh)) {
                     /* Valid by the time we got the lock: no read needed. */
1185                 unlock_buffer(bh);
1186                 return bh;
1187         }
             /*
              * Take an extra reference for the read (presumably dropped by
              * end_buffer_read_sync - confirm), submit it and wait.
              */
1188         get_bh(bh);
1189         bh->b_end_io = end_buffer_read_sync;
1190         submit_bh(READ, bh);
1191         wait_on_buffer(bh);
1192         if (!buffer_uptodate(bh)) {
                     /* The read did not make it valid: release and fail. */
1193                 brelse(bh);
1194                 return NULL;
1195         }
1196         return bh;
1197 }
1198
1199 /*
1200  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1201  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1202  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1203  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1204  * CPU's LRUs at the same time.
1205  *
1206  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1207  * sb_find_get_block().
1208  *
1209  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1210  * a local interrupt disable for that.
1211  */
1212
1213 #define BH_LRU_SIZE     8
     /* Number of buffer_head slots in each CPU's LRU. */
1214
     /* One LRU: a fixed array of buffer_head pointers, unused slots NULL. */
1215 struct bh_lru {
1216         struct buffer_head *bhs[BH_LRU_SIZE];
1217 };
1218
     /* One bh_lru per CPU, with every slot starting out as NULL. */
1219 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1220
     /*
      * Guard around accesses to this CPU's LRU: interrupts are disabled on
      * SMP builds, while non-SMP builds only disable preemption.
      */
1221 #ifdef CONFIG_SMP
1222 #define bh_lru_lock()   local_irq_disable()
1223 #define bh_lru_unlock() local_irq_enable()
1224 #else
1225 #define bh_lru_lock()   preempt_disable()
1226 #define bh_lru_unlock() preempt_enable()
1227 #endif
1228
1229 static inline void check_irqs_on(void)
1230 {
             /*
              * BUG if the caller has interrupts disabled.  The check is only
              * compiled in when irqs_disabled is defined as a macro;
              * otherwise this function is empty.
              */
1231 #ifdef irqs_disabled
1232         BUG_ON(irqs_disabled());
1233 #endif
1234 }
1235
1236 /*
1237  * The LRU management algorithm is dopey-but-simple.  Sorry.
1238  */
1239 static void bh_lru_install(struct buffer_head *bh)
1240 {
             /* Entry pushed off the end, if any; released after unlocking. */
1241         struct buffer_head *evictee = NULL;
1242         struct bh_lru *lru;
1243
1244         check_irqs_on();
1245         bh_lru_lock();
1246         lru = &__get_cpu_var(bh_lrus);
             /* Nothing to do when bh is already the newest entry. */
1247         if (lru->bhs[0] != bh) {
                     /* New ordering is assembled here, then copied back. */
1248                 struct buffer_head *bhs[BH_LRU_SIZE];
1249                 int in;
1250                 int out = 0;
1251
                     /* The reference that the LRU slot itself will hold. */
1252                 get_bh(bh);
1253                 bhs[out++] = bh;
                     /*
                      * Append the old entries in their existing order.  An
                      * older copy of bh is skipped and its reference dropped;
                      * once the new array is full, the one remaining entry
                      * becomes the evictee (the BUG_ON insists there is at
                      * most one).
                      */
1254                 for (in = 0; in < BH_LRU_SIZE; in++) {
1255                         struct buffer_head *bh2 = lru->bhs[in];
1256
1257                         if (bh2 == bh) {
1258                                 __brelse(bh2);
1259                         } else {
1260                                 if (out >= BH_LRU_SIZE) {
1261                                         BUG_ON(evictee != NULL);
1262                                         evictee = bh2;
1263                                 } else {
1264                                         bhs[out++] = bh2;
1265                                 }
1266                         }
1267                 }
                     /* Pad whatever is left of the new array with NULL. */
1268                 while (out < BH_LRU_SIZE)
1269                         bhs[out++] = NULL;
1270                 memcpy(lru->bhs, bhs, sizeof(bhs));
1271         }
1272         bh_lru_unlock();
1273
             /* Drop the evicted entry's reference outside the locked region. */
1274         if (evictee)
1275                 __brelse(evictee);
1276 }
1277
1278 /*
1279  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1280  */
1281 static struct buffer_head *
1282 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1283 {
1284         struct buffer_head *found = NULL;
1285         struct bh_lru *lru;
1286         unsigned int slot;
1287
1288         check_irqs_on();
1289         bh_lru_lock();
1290         lru = &__get_cpu_var(bh_lrus);
1291         for (slot = 0; slot < BH_LRU_SIZE; slot++) {
1292                 struct buffer_head *cand = lru->bhs[slot];
1293
1294                 if (!cand || cand->b_bdev != bdev)
1295                         continue;
1296                 if (cand->b_blocknr != block || cand->b_size != size)
1297                         continue;
1298
                     /*
                      * Hit.  Unless it is already the newest entry, slide
                      * the newer ones down a slot and put it at the front.
                      */
1299                 if (slot) {
1300                         for (; slot; slot--)
1301                                 lru->bhs[slot] = lru->bhs[slot - 1];
1302                         lru->bhs[0] = cand;
1303                 }
1304                 get_bh(cand);           /* reference for the caller */
1305                 found = cand;
1306                 break;
1307         }
1308         bh_lru_unlock();
1309         return found;
1310 }
1311
1312 /*
1313  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1314  * it in the LRU and mark it as accessed.  If it is not present then return
1315  * NULL
1316  */
1317 struct buffer_head *
1318 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1319 {
1320         struct buffer_head *hit = lookup_bh_lru(bdev, block, size);
1321
             /* LRU miss: try the slow lookup and cache what it finds. */
1322         if (!hit) {
1323                 hit = __find_get_block_slow(bdev, block);
1324                 if (!hit)
1325                         return NULL;
1326                 bh_lru_install(hit);
1327         }
             /* Found by either route: mark the buffer as accessed. */
1328         touch_buffer(hit);
1329         return hit;
1330 }
1331 EXPORT_SYMBOL(__find_get_block);
1332
1333 /*
1334  * __getblk will locate (and, if necessary, create) the buffer_head
1335  * which corresponds to the passed block_device, block and size. The
1336  * returned buffer has its reference count incremented.
1337  *
1338  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1339  * illegal block number, __getblk() will happily return a buffer_head
1340  * which represents the non-existent block.  Very weird.
1341  *
1342  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1343  * attempt is failing.  FIXME, perhaps?
1344  */
1345 struct buffer_head *
1346 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1347 {
1348         struct buffer_head *bh = __find_get_block(bdev, block, size);
1349
1350         might_sleep();
1351
1352         /* Lookup missed: fall back to the create-if-absent slow path. */
1353         return bh ? bh : __getblk_slow(bdev, block, size);
1354 }
1355 EXPORT_SYMBOL(__getblk);
1356
1357 /*
1358  * Do async read-ahead on a buffer..
1359  */
1360 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1361 {
1362         struct buffer_head *bh = __getblk(bdev, block, size);
             /*
              * No buffer means nothing to read ahead.  Otherwise queue the
              * read and drop our reference; nothing here waits for the I/O.
              */
1363         if (unlikely(!bh))
1364                 return;
1365         ll_rw_block(READA, 1, &bh);
1366         brelse(bh);
1367 }
1368 EXPORT_SYMBOL(__breadahead);
1369
1370 /**
1371  *  __bread() - reads a specified block and returns the bh
1372  *  @bdev: the block_device to read from
1373  *  @block: number of block
1374  *  @size: size (in bytes) to read
1375  *
1376  *  Reads a specified block, and returns buffer head that contains it.
1377  *  It returns NULL if the block was unreadable.
1378  */
1379 struct buffer_head *
1380 __bread(struct block_device *bdev, sector_t block, unsigned size)
1381 {
1382         struct buffer_head *bh = __getblk(bdev, block, size);
1383
             /* No buffer at all, or one whose contents are already valid. */
1384         if (unlikely(!bh) || buffer_uptodate(bh))
1385                 return bh;
1386         return __bread_slow(bh);
1387 }
1388 EXPORT_SYMBOL(__bread);
1389
1390 /*
1391  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1392  * This doesn't race because it runs in each cpu either in irq
1393  * or with preempt disabled.
1394  */
1395 static void invalidate_bh_lru(void *arg)
1396 {
1397         struct bh_lru *lru = &get_cpu_var(bh_lrus);
1398         struct buffer_head **slot;
1399
             /* Release every slot of this CPU's LRU and leave it empty. */
1400         for (slot = lru->bhs; slot < lru->bhs + BH_LRU_SIZE; slot++) {
1401                 brelse(*slot);
1402                 *slot = NULL;
1403         }
1404         put_cpu_var(bh_lrus);
1405 }
1406
1407 void invalidate_bh_lrus(void)
1408 {
             /*
              * Run invalidate_bh_lru() on every CPU so that each per-cpu LRU
              * drops its buffers.
              * NOTE(review): the trailing "1, 1" are presumably the retry and
              * wait flags of on_each_cpu() - confirm.
              */
1409         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1410 }
1411
1412 void set_bh_page(struct buffer_head *bh,
1413                 struct page *page, unsigned long offset)
1414 {
1415         bh->b_page = page;
1416         BUG_ON(offset >= PAGE_SIZE);
1417
1418         if (!PageHighMem(page)) {
1419                 bh->b_data = page_address(page) + offset;
1420                 return;
1421         }
1422         /* Highmem: this catches illegal uses and preserves the offset. */
1423         bh->b_data = (char *)(0 + offset);
1424 }
1425 EXPORT_SYMBOL(set_bh_page);
1426
1427 /*
1428  * Called when truncating a buffer on a page completely.
1429  */
1430 static void discard_buffer(struct buffer_head * bh)
1431 {
             /*
              * With the buffer locked, strip everything that ties it to a
              * disk block: the dirty, mapped, req, new, delay and unwritten
              * state bits and the b_bdev pointer.  The uptodate bit is not
              * touched here.
              */
1432         lock_buffer(bh);
1433         clear_buffer_dirty(bh);
1434         bh->b_bdev = NULL;
1435         clear_buffer_mapped(bh);
1436         clear_buffer_req(bh);
1437         clear_buffer_new(bh);
1438         clear_buffer_delay(bh);
1439         clear_buffer_unwritten(bh);
1440         unlock_buffer(bh);
1441 }
1442
1443 /**
1444  * block_invalidatepage - invalidate part or all of a buffer-backed page
1445  *
1446  * @page: the page which is affected
1447  * @offset: the index of the truncation point
1448  *
1449  * block_invalidatepage() is called when all or part of the page has become
1450  * invalidated by a truncate operation.
1451  *
1452  * block_invalidatepage() does not have to release all buffers, but it must
1453  * ensure that no dirty buffer is left outside @offset and that no I/O
1454  * is underway against any of the blocks which are outside the truncation
1455  * point.  Because the caller is about to free (and possibly reuse) those
1456  * blocks on-disk.
1457  */
1458 void block_invalidatepage(struct page *page, unsigned long offset)
1459 {
1460         struct buffer_head *first, *cur;
1461         unsigned int start = 0;         /* byte offset of 'cur' in the page */
1462
1463         BUG_ON(!PageLocked(page));
1464         if (!page_has_buffers(page))
1465                 return;
1466
1467         first = page_buffers(page);
1468         cur = first;
1469         do {
1470                 /* Sample both fields before the buffer can be discarded. */
1471                 struct buffer_head *following = cur->b_this_page;
1472                 unsigned int end = start + cur->b_size;
1473
1474                 /*
1475                  * Discard only buffers that begin at or after @offset;
1476                  * one that straddles the truncation point is kept.
1477                  */
1478                 if (start >= offset)
1479                         discard_buffer(cur);
1480                 start = end;
1481                 cur = following;
1482         } while (cur != first);
1483
1484         /*
1485          * Only a whole-page invalidation goes on to release the buffers.
1486          * Every buffer was discarded above in that case, so no real I/O
1487          * can be started against them any more.
1488          */
1489         if (!offset)
1490                 try_to_release_page(page, 0);
1491 }
1492 EXPORT_SYMBOL(block_invalidatepage);
1493
1494 /*
1495  * We attach and possibly dirty the buffers atomically wrt
1496  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1497  * is already excluded via the page lock.
1498  */
1499 void create_empty_buffers(struct page *page,
1500                         unsigned long blocksize, unsigned long b_state)
1501 {
1502         struct buffer_head *bh, *head, *tail;
1503
             /*
              * retry == 1: alloc_page_buffers() keeps trying instead of
              * returning NULL, which is why head is used unchecked below.
              */
1504         head = alloc_page_buffers(page, blocksize, 1);
1505         bh = head;
             /*
              * OR the caller's initial state bits into every buffer while
              * finding the tail, then close the chain into a ring.
              */
1506         do {
1507                 bh->b_state |= b_state;
1508                 tail = bh;
1509                 bh = bh->b_this_page;
1510         } while (bh);
1511         tail->b_this_page = head;
1512
             /*
              * Under private_lock: copy the page's dirty and uptodate state
              * onto each buffer (the page flags are re-read for every
              * buffer), then attach the ring to the page.
              */
1513         spin_lock(&page->mapping->private_lock);
1514         if (PageUptodate(page) || PageDirty(page)) {
1515                 bh = head;
1516                 do {
1517                         if (PageDirty(page))
1518                                 set_buffer_dirty(bh);
1519                         if (PageUptodate(page))
1520                                 set_buffer_uptodate(bh);
1521                         bh = bh->b_this_page;
1522                 } while (bh != head);
1523         }
1524         attach_page_buffers(page, head);
1525         spin_unlock(&page->mapping->private_lock);
1526 }
1527 EXPORT_SYMBOL(create_empty_buffers);
1528
1529 /*
1530  * We are taking a block for data and we don't want any output from any
1531  * buffer-cache aliases starting from return from that function and
1532  * until the moment when something will explicitly mark the buffer
1533  * dirty (hopefully that will not happen until we will free that block ;-)
1534  * We don't even need to mark it not-uptodate - nobody can expect
1535  * anything from a newly allocated buffer anyway. We used to use
1536  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1537  * don't want to mark the alias unmapped, for example - it would confuse
1538  * anyone who might pick it with bread() afterwards...
1539  *
1540  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1541  * be writeout I/O going on against recently-freed buffers.  We don't
1542  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1543  * only if we really need to.  That happens here.
1544  */
1545 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1546 {
1547         struct buffer_head *alias;
1548
1549         might_sleep();
1550
1551         alias = __find_get_block_slow(bdev, block);
1552         if (!alias)
1553                 return;
             /*
              * An alias exists in the blockdev's cache: stop it from being
              * written out, wait for any I/O already under way against it,
              * then drop the reference taken by the lookup.
              */
1554         clear_buffer_dirty(alias);
1555         wait_on_buffer(alias);
1556         clear_buffer_req(alias);
1557         __brelse(alias);
1558 }
1559 EXPORT_SYMBOL(unmap_underlying_metadata);
1560
1561 /*
1562  * NOTE! All mapped/uptodate combinations are valid:
1563  *
1564  *      Mapped  Uptodate        Meaning
1565  *
1566  *      No      No              "unknown" - must do get_block()
1567  *      No      Yes             "hole" - zero-filled
1568  *      Yes     No              "allocated" - allocated on disk, not read in
1569  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1570  *
1571  * "Dirty" is valid only with the last case (mapped+uptodate).
1572  */
1573
1574 /*
1575  * While block_write_full_page is writing back the dirty buffers under
1576  * the page lock, whoever dirtied the buffers may decide to clean them
1577  * again at any time.  We handle that by only looking at the buffer
1578  * state inside lock_buffer().
1579  *
1580  * If block_write_full_page() is called for regular writeback
1581  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1582  * locked buffer.   This only can happen if someone has written the buffer
1583  * directly, with submit_bh().  At the address_space level PageWriteback
1584  * prevents this contention from occurring.
1585  */
1586 static int __block_write_full_page(struct inode *inode, struct page *page,
1587                         get_block_t *get_block, struct writeback_control *wbc)
1588 {
1589         int err;
1590         sector_t block;
1591         sector_t last_block;
1592         struct buffer_head *bh, *head;
1593         const unsigned blocksize = 1 << inode->i_blkbits;
1594         int nr_underway = 0;
1595
1596         BUG_ON(!PageLocked(page));
1597
1598         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1599
1600         if (!page_has_buffers(page)) {
1601                 create_empty_buffers(page, blocksize,
1602                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1603         }
1604
1605         /*
1606          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1607          * here, and the (potentially unmapped) buffers may become dirty at
1608          * any time.  If a buffer becomes dirty here after we've inspected it
1609          * then we just miss that fact, and the page stays dirty.
1610          *
1611          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1612          * handle that here by just cleaning them.
1613          */
1614
1615         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1616         head = page_buffers(page);
1617         bh = head;
1618
1619         /*
1620          * Get all the dirty buffers mapped to disk addresses and
1621          * handle any aliases from the underlying blockdev's mapping.
1622          */
1623         do {
1624                 if (block > last_block) {
1625                         /*
1626                          * mapped buffers outside i_size will occur, because
1627                          * this page can be outside i_size when there is a
1628                          * truncate in progress.
1629                          */
1630                         /*
1631                          * The buffer was zeroed by block_write_full_page()
1632                          */
1633                         clear_buffer_dirty(bh);
1634                         set_buffer_uptodate(bh);
1635                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1636                         WARN_ON(bh->b_size != blocksize);
1637                         err = get_block(inode, block, bh, 1);
1638                         if (err)
1639                                 goto recover;
1640                         if (buffer_new(bh)) {
1641                                 /* blockdev mappings never come here */
1642                                 clear_buffer_new(bh);
1643                                 unmap_underlying_metadata(bh->b_bdev,
1644                                                         bh->b_blocknr);
1645                         }
1646                 }
1647                 bh = bh->b_this_page;
1648                 block++;
1649         } while (bh != head);
1650
1651         do {
1652                 if (!buffer_mapped(bh))
1653                         continue;
1654                 /*
1655                  * If it's a fully non-blocking write attempt and we cannot
1656                  * lock the buffer then redirty the page.  Note that this can
1657                  * potentially cause a busy-wait loop from pdflush and kswapd
1658                  * activity, but those code paths have their own higher-level
1659                  * throttling.
1660                  */
1661                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1662                         lock_buffer(bh);
1663                 } else if (test_set_buffer_locked(bh)) {
1664                         redirty_page_for_writepage(wbc, page);
1665                         continue;
1666                 }
1667                 if (test_clear_buffer_dirty(bh)) {
1668                         mark_buffer_async_write(bh);
1669                 } else {
1670                         unlock_buffer(bh);
1671                 }
1672         } while ((bh = bh->b_this_page) != head);
1673
1674         /*
1675          * The page and its buffers are protected by PageWriteback(), so we can
1676          * drop the bh refcounts early.
1677          */
1678         BUG_ON(PageWriteback(page));
1679         set_page_writeback(page);
1680
1681         do {
1682                 struct buffer_head *next = bh->b_this_page;
1683                 if (buffer_async_write(bh)) {
1684                         submit_bh(WRITE, bh);
1685                         nr_underway++;
1686                 }
1687                 bh = next;
1688         } while (bh != head);
1689         unlock_page(page);
1690
1691         err = 0;
1692 done:
1693         if (nr_underway == 0) {
1694                 /*
1695                  * The page was marked dirty, but the buffers were
1696                  * clean.  Someone wrote them back by hand with
1697                  * ll_rw_block/submit_bh.  A rare case.
1698                  */
1699                 end_page_writeback(page);
1700
1701                 /*
1702                  * The page and buffer_heads can be released at any time from
1703                  * here on.
1704                  */
1705                 wbc->pages_skipped++;   /* We didn't write this page */
1706         }
1707         return err;
1708
1709 recover:
1710         /*
1711          * ENOSPC, or some other error.  We may already have added some
1712          * blocks to the file, so we need to write these out to avoid
1713          * exposing stale data.
1714          * The page is currently locked and not marked for writeback
1715          */
1716         bh = head;
1717         /* Recovery: lock and submit the mapped buffers */
1718         do {
1719                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1720                         lock_buffer(bh);
1721                         mark_buffer_async_write(bh);
1722                 } else {
1723                         /*
1724                          * The buffer may have been set dirty during
1725                          * attachment to a dirty page.
1726                          */
1727                         clear_buffer_dirty(bh);
1728                 }
1729         } while ((bh = bh->b_this_page) != head);
1730         SetPageError(page);
1731         BUG_ON(PageWriteback(page));
1732         mapping_set_error(page->mapping, err);
1733         set_page_writeback(page);
1734         do {
1735                 struct buffer_head *next = bh->b_this_page;
1736                 if (buffer_async_write(bh)) {
1737                         clear_buffer_dirty(bh);
1738                         submit_bh(WRITE, bh);
1739                         nr_underway++;
1740                 }
1741                 bh = next;
1742         } while (bh != head);
1743         unlock_page(page);
1744         goto done;
1745 }
1746
/*
 * __block_prepare_write - get the buffers of a locked page ready for a write
 * to the byte range [from, to) inside that page.
 *
 * @inode:     inode that supplies the block size (i_blkbits)
 * @page:      locked page being written; empty buffers are attached to it
 *             if it has none
 * @from:      offset in the page where the write starts
 * @to:        offset in the page where the write ends (exclusive)
 * @get_block: filesystem callback used, with create == 1, to map buffers
 *             that are not mapped yet
 *
 * Buffers overlapping the range are mapped.  For a freshly allocated
 * (BH_New) block the bytes of the block lying outside [from, to) are zeroed.
 * A block that is only partially covered by the range and is not uptodate
 * is read in and waited for.
 *
 * Returns 0 on success.  On failure returns the get_block() error, or -EIO
 * when a read did not leave its buffer uptodate; before returning, every
 * BH_New buffer inside the range is zeroed, marked uptodate and dirtied.
 */
1747 static int __block_prepare_write(struct inode *inode, struct page *page,
1748                 unsigned from, unsigned to, get_block_t *get_block)
1749 {
1750         unsigned block_start, block_end;
1751         sector_t block;
1752         int err = 0;
1753         unsigned blocksize, bbits;
             /*
              * wait[] collects the buffers that have a read in flight.
              * NOTE(review): two slots presumably suffice because only the
              * block straddling 'from' and the block straddling 'to' can
              * be partially covered -- confirm.
              */
1754         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1755
1756         BUG_ON(!PageLocked(page));
1757         BUG_ON(from > PAGE_CACHE_SIZE);
1758         BUG_ON(to > PAGE_CACHE_SIZE);
1759         BUG_ON(from > to);
1760
1761         blocksize = 1 << inode->i_blkbits;
1762         if (!page_has_buffers(page))
1763                 create_empty_buffers(page, blocksize, 0);
1764         head = page_buffers(page);
1765
1766         bbits = inode->i_blkbits;
             /* File-relative number of the first block held by this page. */
1767         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1768
             /*
              * Walk the circular buffer list once.  "!block_start" lets the
              * first iteration through even though bh == head at that point.
              */
1769         for(bh = head, block_start = 0; bh != head || !block_start;
1770             block++, block_start=block_end, bh = bh->b_this_page) {
1771                 block_end = block_start + blocksize;
                     /* Buffer lies completely outside the write range. */
1772                 if (block_end <= from || block_start >= to) {
1773                         if (PageUptodate(page)) {
1774                                 if (!buffer_uptodate(bh))
1775                                         set_buffer_uptodate(bh);
1776                         }
1777                         continue;
1778                 }
                     /* Drop a stale BH_New so it only reflects this call. */
1779                 if (buffer_new(bh))
1780                         clear_buffer_new(bh);
1781                 if (!buffer_mapped(bh)) {
1782                         WARN_ON(bh->b_size != blocksize);
1783                         err = get_block(inode, block, bh, 1);
1784                         if (err)
1785                                 break;
1786                         if (buffer_new(bh)) {
1787                                 unmap_underlying_metadata(bh->b_bdev,
1788                                                         bh->b_blocknr);
1789                                 if (PageUptodate(page)) {
1790                                         set_buffer_uptodate(bh);
1791                                         continue;
1792                                 }
                                     /*
                                      * New block only partly covered by the
                                      * write: zero the bytes of the block
                                      * that fall outside [from, to).
                                      */
1793                                 if (block_end > to || block_start < from) {
1794                                         void *kaddr;
1795
1796                                         kaddr = kmap_atomic(page, KM_USER0);
1797                                         if (block_end > to)
1798                                                 memset(kaddr+to, 0,
1799                                                         block_end-to);
1800                                         if (block_start < from)
1801                                                 memset(kaddr+block_start,
1802                                                         0, from-block_start);
1803                                         flush_dcache_page(page);
1804                                         kunmap_atomic(kaddr, KM_USER0);
1805                                 }
1806                                 continue;
1807                         }
1808                 }
1809                 if (PageUptodate(page)) {
1810                         if (!buffer_uptodate(bh))
1811                                 set_buffer_uptodate(bh);
1812                         continue;
1813                 }
                     /*
                      * Partially overwritten block whose contents are not in
                      * memory yet: start a read and remember the buffer so
                      * it can be waited for below.
                      */
1814                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1815                     !buffer_unwritten(bh) &&
1816                      (block_start < from || block_end > to)) {
1817                         ll_rw_block(READ, 1, &bh);
1818                         *wait_bh++=bh;
1819                 }
1820         }
1821         /*
1822          * If we issued read requests - let them complete.
1823          */
1824         while(wait_bh > wait) {
1825                 wait_on_buffer(*--wait_bh);
1826                 if (!buffer_uptodate(*wait_bh))
1827                         err = -EIO;
1828         }
1829         if (!err) {
                     /* Success: clear BH_New on every buffer of the page. */
1830                 bh = head;
1831                 do {
1832                         if (buffer_new(bh))
1833                                 clear_buffer_new(bh);
1834                 } while ((bh = bh->b_this_page) != head);
1835                 return 0;
1836         }
1837         /* Error case: */
1838         /*
1839          * Zero out any newly allocated blocks to avoid exposing stale
1840          * data.  If BH_New is set, we know that the block was newly
1841          * allocated in the above loop.
1842          */
1843         bh = head;
1844         block_start = 0;
1845         do {
1846                 block_end = block_start+blocksize;
                     /* Skip buffers before the range; stop once past it. */
1847                 if (block_end <= from)
1848                         goto next_bh;
1849                 if (block_start >= to)
1850                         break;
1851                 if (buffer_new(bh)) {
1852                         clear_buffer_new(bh);
1853                         zero_user_page(page, block_start, bh->b_size, KM_USER0);
1854                         set_buffer_uptodate(bh);
1855                         mark_buffer_dirty(bh);
1856                 }
1857 next_bh:
1858                 block_start = block_end;
1859                 bh = bh->b_this_page;
1860         } while (bh != head);
1861         return err;
1862 }
1863
1864 static int __block_commit_write(struct inode *inode, struct page *page,
1865                 unsigned from, unsigned to)
1866 {
1867         unsigned block_start, block_end;
1868         int partial = 0;
1869         unsigned blocksize;
1870         struct buffer_head *bh, *head;
1871
1872         blocksize = 1 << inode->i_blkbits;
1873
1874         for(bh = head = page_buffers(page), block_start = 0;
1875             bh != head || !block_start;
1876             block_start=block_end, bh = bh->b_this_page) {
1877                 block_end = block_start + blocksize;
1878                 if (block_end <= from || block_start >= to) {
1879                         if (!buffer_uptodate(bh))
1880                                 partial = 1;
1881                 } else {
1882                         set_buffer_uptodate(bh);
1883                         mark_buffer_dirty(bh);
1884                 }
1885         }
1886
1887         /*
1888          * If this is a partial write which happened to make all buffers
1889          * uptodate then we can optimize away a bogus readpage() for
1890          * the next read(). Here we 'discover' whether the page went
1891          * uptodate as a result of this (potentially partial) write.
1892          */
1893         if (!partial)
1894                 SetPageUptodate(page);
1895         return 0;
1896 }
1897
1898 /*
1899  * block_is_partially_uptodate checks whether buffers within a page are
1900  * uptodate or not.
1901  *
1902  * Returns true if all buffers which correspond to a file portion
1903  * we want to read are uptodate.
1904  */
1905 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
1906                                         unsigned long from)
1907 {
1908         struct inode *inode = page->mapping->host;
1909         unsigned block_start, block_end, blocksize;
1910         unsigned to;
1911         struct buffer_head *bh, *head;
1912         int ret = 1;
1913
1914         if (!page_has_buffers(page))
1915                 return 0;
1916
1917         blocksize = 1 << inode->i_blkbits;
1918         to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
1919         to = from + to;
1920         if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
1921                 return 0;
1922
1923         head = page_buffers(page);
1924         bh = head;
1925         block_start = 0;
1926         do {
1927                 block_end = block_start + blocksize;
1928                 if (block_end > from && block_start < to) {
1929                         if (!buffer_uptodate(bh)) {
1930                                 ret = 0;
1931                                 break;
1932                         }
1933                         if (block_end >= to)
1934                                 break;
1935                 }
1936                 block_start = block_end;
1937                 bh = bh->b_this_page;
1938         } while (bh != head);
1939
1940         return ret;
1941 }
1942 EXPORT_SYMBOL(block_is_partially_uptodate);
1943
1944 /*
1945  * Generic "read page" function for block devices that have the normal
1946  * get_block functionality. This is most of the block device filesystems.
1947  * Reads the page asynchronously --- the unlock_buffer() and
1948  * set/clear_buffer_uptodate() functions propagate buffer state into the
1949  * page struct once IO has completed.
      *
      * @page:      locked page to fill; empty buffers are attached to it if
      *             it has none
      * @get_block: filesystem callback, called with create == 0, that maps
      *             the buffers which are not mapped yet
      *
      * Always returns 0.  A get_block() failure is recorded on the page with
      * SetPageError() instead of being returned.  When no read has to be
      * submitted the page is unlocked here; otherwise the unlock is
      * presumably done by the async read completion -- confirm against
      * end_buffer_async_read().
1950  */
1951 int block_read_full_page(struct page *page, get_block_t *get_block)
1952 {
1953         struct inode *inode = page->mapping->host;
1954         sector_t iblock, lblock;
1955         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1956         unsigned int blocksize;
1957         int nr, i;
1958         int fully_mapped = 1;
1959
1960         BUG_ON(!PageLocked(page));
1961         blocksize = 1 << inode->i_blkbits;
1962         if (!page_has_buffers(page))
1963                 create_empty_buffers(page, blocksize, 0);
1964         head = page_buffers(page);
1965
             /*
              * iblock: file-relative number of the page's first block.
              * lblock: number of blocks needed to hold i_size, rounded up.
              */
1966         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1967         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
1968         bh = head;
1969         nr = 0;
1970         i = 0;
1971
             /*
              * Stage one: collect in arr[] the buffers that need a read.
              * Note that "continue" jumps to the loop condition, so i,
              * iblock and bh still advance for skipped buffers.
              */
1972         do {
1973                 if (buffer_uptodate(bh))
1974                         continue;
1975
1976                 if (!buffer_mapped(bh)) {
1977                         int err = 0;
1978
1979                         fully_mapped = 0;
                             /* Only ask for a mapping below the i_size limit. */
1980                         if (iblock < lblock) {
1981                                 WARN_ON(bh->b_size != blocksize);
1982                                 err = get_block(inode, iblock, bh, 0);
1983                                 if (err)
1984                                         SetPageError(page);
1985                         }
                             /*
                              * Still unmapped: nothing to read from disk.
                              * Zero the block's part of the page, and call
                              * it uptodate unless get_block() failed.
                              */
1986                         if (!buffer_mapped(bh)) {
1987                                 zero_user_page(page, i * blocksize, blocksize,
1988                                                 KM_USER0);
1989                                 if (!err)
1990                                         set_buffer_uptodate(bh);
1991                                 continue;
1992                         }
1993                         /*
1994                          * get_block() might have updated the buffer
1995                          * synchronously
1996                          */
1997                         if (buffer_uptodate(bh))
1998                                 continue;
1999                 }
2000                 arr[nr++] = bh;
2001         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2002
2003         if (fully_mapped)
2004                 SetPageMappedToDisk(page);
2005
2006         if (!nr) {
2007                 /*
2008                  * All buffers are uptodate - we can set the page uptodate
2009                  * as well. But not if get_block() returned an error.
2010                  */
2011                 if (!PageError(page))
2012                         SetPageUptodate(page);
2013                 unlock_page(page);
2014                 return 0;
2015         }
2016
2017         /* Stage two: lock the buffers */
2018         for (i = 0; i < nr; i++) {
2019                 bh = arr[i];
2020                 lock_buffer(bh);
2021                 mark_buffer_async_read(bh);
2022         }
2023
2024         /*
2025          * Stage 3: start the IO.  Check for uptodateness
2026          * inside the buffer lock in case another process reading
2027          * the underlying blockdev brought it uptodate (the sct fix).
2028          */
2029         for (i = 0; i < nr; i++) {
2030                 bh = arr[i];
2031                 if (buffer_uptodate(bh))
2032                         end_buffer_async_read(bh, 1);
2033                 else
2034                         submit_bh(READ, bh);
2035         }
2036         return 0;
2037 }
2038
2039 /* utility function for filesystems that need to do work on expanding
2040  * truncates.  Uses prepare/commit_write to allow the filesystem to
2041  * deal with the hole.
2042  */
2043 static int __generic_cont_expand(struct inode *inode, loff_t size,
2044                                  pgoff_t index, unsigned int offset)
2045 {
2046         struct address_space *mapping = inode->i_mapping;
2047         struct page *page;
2048         unsigned long limit;
2049         int err;
2050
2051         err = -EFBIG;
2052         limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2053         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2054                 send_sig(SIGXFSZ, current, 0);
2055                 goto out;
2056         }
2057         if (size > inode->i_sb->s_maxbytes)
2058                 goto out;
2059
2060         err = -ENOMEM;
2061         page = grab_cache_page(mapping, index);
2062         if (!page)
2063                 goto out;
2064         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2065         if (err) {
2066                 /*
2067                  * ->prepare_write() may have instantiated a few blocks
2068                  * outside i_size.  Trim these off again.
2069                  */
2070                 unlock_page(page);
2071                 page_cache_release(page);
2072                 vmtruncate(inode, inode->i_size);
2073                 goto out;
2074         }
2075
2076         err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2077
2078         unlock_page(page);
2079         page_cache_release(page);
2080         if (err > 0)
2081                 err = 0;
2082 out:
2083         return err;
2084 }
2085
2086 int generic_cont_expand(struct inode *inode, loff_t size)
2087 {
2088         pgoff_t index;
2089         unsigned int offset;
2090
2091         offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2092
2093         /* ugh.  in prepare/commit_write, if from==to==start of block, we
2094         ** skip the prepare.  make sure we never send an offset for the start
2095         ** of a block
2096         */
2097         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2098                 /* caller must handle this extra byte. */
2099                 offset++;
2100         }
2101         index = size >> PAGE_CACHE_SHIFT;
2102
2103         return __generic_cont_expand(inode, size, index, offset);
2104 }
2105
2106 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2107 {
2108         loff_t pos = size - 1;
2109         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2110         unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2111
2112         /* prepare/commit_write can handle even if from==to==start of block. */
2113         return __generic_cont_expand(inode, size, index, offset);
2114 }
2115
2116 /*
2117  * For moronic filesystems that do not allow holes in file.
2118  * We may have to extend the file.
      *
      * @page:      target page of the write
      * @offset:    offset in @page where the write starts
      * @to:        offset in @page where the write ends
      * @get_block: block mapping callback handed to __block_prepare_write()
      * @bytes:     in/out byte position; NOTE(review): presumably the
      *             filesystem's record of how far the file has been
      *             filled in so far -- confirm against callers.
      *
      * Every page between the one holding *bytes and @page is zero-filled
      * from *bytes to the end of the page and committed.  Then @page itself
      * is prepared, and the gap between *bytes and @offset inside it (if
      * any) is zeroed and committed.
      *
      * Returns 0 on success, -ENOMEM if an intermediate page cannot be
      * grabbed, or the error from __block_prepare_write().
2119  */
2120
2121 int cont_prepare_write(struct page *page, unsigned offset,
2122                 unsigned to, get_block_t *get_block, loff_t *bytes)
2123 {
2124         struct address_space *mapping = page->mapping;
2125         struct inode *inode = mapping->host;
2126         struct page *new_page;
2127         pgoff_t pgpos;
2128         long status;
2129         unsigned zerofrom;
2130         unsigned blocksize = 1 << inode->i_blkbits;
2131
             /* Fill in the pages that lie before the target page. */
2132         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2133                 status = -ENOMEM;
2134                 new_page = grab_cache_page(mapping, pgpos);
2135                 if (!new_page)
2136                         goto out;
2137                 /* we might sleep */
                     /* If *bytes moved to another page meanwhile, start over. */
2138                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2139                         unlock_page(new_page);
2140                         page_cache_release(new_page);
2141                         continue;
2142                 }
2143                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
                     /* Not block aligned: round *bytes up to the next block. */
2144                 if (zerofrom & (blocksize-1)) {
2145                         *bytes |= (blocksize-1);
2146                         (*bytes)++;
2147                 }
2148                 status = __block_prepare_write(inode, new_page, zerofrom,
2149                                                 PAGE_CACHE_SIZE, get_block);
2150                 if (status)
2151                         goto out_unmap;
2152                 zero_user_page(new_page, zerofrom, PAGE_CACHE_SIZE - zerofrom,
2153                                 KM_USER0);
2154                 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2155                 unlock_page(new_page);
2156                 page_cache_release(new_page);
2157
2158                 balance_dirty_pages_ratelimited(mapping);
2159         }
2160
2161         if (page->index < pgpos) {
2162                 /* completely inside the area */
2163                 zerofrom = offset;
2164         } else {
2165                 /* page covers the boundary, find the boundary offset */
2166                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2167
2168                 /* if we will expand the thing last block will be filled */
2169                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2170                         *bytes |= (blocksize-1);
2171                         (*bytes)++;
2172                 }
2173
2174                 /* starting below the boundary? Nothing to zero out */
2175                 if (offset <= zerofrom)
2176                         zerofrom = offset;
2177         }
2178         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2179         if (status)
2180                 goto out1;
             /* Zero and commit the gap between the boundary and the write. */
2181         if (zerofrom < offset) {
2182                 zero_user_page(page, zerofrom, offset - zerofrom, KM_USER0);
2183                 __block_commit_write(inode, page, zerofrom, offset);
2184         }
2185         return 0;
2186 out1:
             /* Preparing the target page failed. */
2187         ClearPageUptodate(page);
2188         return status;
2189
2190 out_unmap:
             /* Preparing an intermediate page failed: drop it. */
2191         ClearPageUptodate(new_page);
2192         unlock_page(new_page);
2193         page_cache_release(new_page);
2194 out:
2195         return status;
2196 }
2197
2198 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2199                         get_block_t *get_block)
2200 {
2201         struct inode *inode = page->mapping->host;
2202         int err = __block_prepare_write(inode, page, from, to, get_block);
2203         if (err)
2204                 ClearPageUptodate(page);
2205         return err;
2206 }
2207
2208 int block_commit_write(struct page *page, unsigned from, unsigned to)
2209 {
2210         struct inode *inode = page->mapping->host;
2211         __block_commit_write(inode,page,from,to);
2212         return 0;
2213 }
2214
2215 int generic_commit_write(struct file *file, struct page *page,
2216                 unsigned from, unsigned to)
2217 {
2218         struct inode *inode = page->mapping->host;
2219         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2220         __block_commit_write(inode,page,from,to);
2221         /*
2222          * No need to use i_size_read() here, the i_size
2223          * cannot change under us because we hold i_mutex.
2224          */
2225         if (pos > inode->i_size) {
2226                 i_size_write(inode, pos);
2227                 mark_inode_dirty(inode);
2228         }
2229         return 0;
2230 }
2231
2232
/*
 * I/O completion handler for the prereads issued by nobh_prepare_write().
 * Those buffer_heads are freed immediately, while still under the page
 * lock, so the handler must not touch the bh once it has been unlocked.
 *
 * Note: unlock_buffer() does sort-of touch the bh after unlocking it, but
 * that race is benign: it only uses the bh's address for hashing at that
 * point and does not access the bh itself.
 *
 * @bh:       buffer whose read has completed
 * @uptodate: non-zero when the read succeeded
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	if (!uptodate)
		clear_buffer_uptodate(bh);	/* happens on failed READA */
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
2253
2254 /*
2255  * On entry, the page is fully not uptodate.
2256  * On exit the page is fully uptodate in the areas outside (from,to)
      *
      * @page:      page about to be written
      * @from:      offset in the page where the write starts
      * @to:        offset in the page where the write ends
      * @get_block: block mapping callback; called with create == 1 for
      *             blocks starting before @to and create == 0 after that
      *
      * Works without attaching buffer_heads to the page: a buffer_head on
      * the stack is used for mapping, and short-lived allocated ones for
      * reading in the blocks that the write covers only partially.
      *
      * Returns 0 on success (immediately if the page is already
      * PageMappedToDisk).  On failure returns the get_block() error,
      * -ENOMEM or -EIO, after zeroing the whole page and marking it
      * uptodate and dirty.
2257  */
2258 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2259                         get_block_t *get_block)
2260 {
2261         struct inode *inode = page->mapping->host;
2262         const unsigned blkbits = inode->i_blkbits;
2263         const unsigned blocksize = 1 << blkbits;
2264         struct buffer_head map_bh;
2265         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2266         unsigned block_in_page;
2267         unsigned block_start;
2268         sector_t block_in_file;
2269         char *kaddr;
2270         int nr_reads = 0;
2271         int i;
2272         int ret = 0;
2273         int is_mapped_to_disk = 1;
2274
2275         if (PageMappedToDisk(page))
2276                 return 0;
2277
             /* File-relative number of the first block held by this page. */
2278         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2279         map_bh.b_page = page;
2280
2281         /*
2282          * We loop across all blocks in the page, whether or not they are
2283          * part of the affected region.  This is so we can discover if the
2284          * page is fully mapped-to-disk.
2285          */
2286         for (block_start = 0, block_in_page = 0;
2287                   block_start < PAGE_CACHE_SIZE;
2288                   block_in_page++, block_start += blocksize) {
2289                 unsigned block_end = block_start + blocksize;
2290                 int create;
2291
                     /* Reset the scratch bh before each mapping request. */
2292                 map_bh.b_state = 0;
2293                 create = 1;
                     /* Do not allocate blocks that start at or after 'to'. */
2294                 if (block_start >= to)
2295                         create = 0;
2296                 map_bh.b_size = blocksize;
2297                 ret = get_block(inode, block_in_file + block_in_page,
2298                                         &map_bh, create);
2299                 if (ret)
2300                         goto failed;
2301                 if (!buffer_mapped(&map_bh))
2302                         is_mapped_to_disk = 0;
2303                 if (buffer_new(&map_bh))
2304                         unmap_underlying_metadata(map_bh.b_bdev,
2305                                                         map_bh.b_blocknr);
2306                 if (PageUptodate(page))
2307                         continue;
                     /*
                      * New or unmapped block: there is nothing to read, so
                      * zero the parts of it that fall outside [from, to).
                      */
2308                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2309                         kaddr = kmap_atomic(page, KM_USER0);
2310                         if (block_start < from)
2311                                 memset(kaddr+block_start, 0, from-block_start);
2312                         if (block_end > to)
2313                                 memset(kaddr + to, 0, block_end - to);
2314                         flush_dcache_page(page);
2315                         kunmap_atomic(kaddr, KM_USER0);
2316                         continue;
2317                 }
2318                 if (buffer_uptodate(&map_bh))
2319                         continue;       /* reiserfs does this */
                     /*
                      * Existing block only partly covered by the write:
                      * queue a temporary buffer_head to read it in.
                      */
2320                 if (block_start < from || block_end > to) {
2321                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2322
2323                         if (!bh) {
2324                                 ret = -ENOMEM;
2325                                 goto failed;
2326                         }
2327                         bh->b_state = map_bh.b_state;
2328                         atomic_set(&bh->b_count, 0);
2329                         bh->b_this_page = NULL;
2330                         bh->b_page = page;
2331                         bh->b_blocknr = map_bh.b_blocknr;
2332                         bh->b_size = blocksize;
                             /*
                              * b_data holds the offset in the page, not a
                              * pointer.  NOTE(review): presumably
                              * submit_bh() only uses it to derive that
                              * offset -- confirm.
                              */
2333                         bh->b_data = (char *)(long)block_start;
2334                         bh->b_bdev = map_bh.b_bdev;
2335                         bh->b_private = NULL;
2336                         read_bh[nr_reads++] = bh;
2337                 }
2338         }
2339
2340         if (nr_reads) {
2341                 struct buffer_head *bh;
2342
2343                 /*
2344                  * The page is locked, so these buffers are protected from
2345                  * any VM or truncate activity.  Hence we don't need to care
2346                  * for the buffer_head refcounts.
2347                  */
2348                 for (i = 0; i < nr_reads; i++) {
2349                         bh = read_bh[i];
2350                         lock_buffer(bh);
2351                         bh->b_end_io = end_buffer_read_nobh;
2352                         submit_bh(READ, bh);
2353                 }
                     /*
                      * Wait for every read and free its bh.  The slot is
                      * set to NULL so the 'failed' path skips it.
                      */
2354                 for (i = 0; i < nr_reads; i++) {
2355                         bh = read_bh[i];
2356                         wait_on_buffer(bh);
2357                         if (!buffer_uptodate(bh))
2358                                 ret = -EIO;
2359                         free_buffer_head(bh);
2360                         read_bh[i] = NULL;
2361                 }
2362                 if (ret)
2363                         goto failed;
2364         }
2365
2366         if (is_mapped_to_disk)
2367                 SetPageMappedToDisk(page);
2368
2369         return 0;
2370
2371 failed:
             /* Free the temporary buffer_heads that are still allocated. */
2372         for (i = 0; i < nr_reads; i++) {
2373                 if (read_bh[i])
2374                         free_buffer_head(read_bh[i]);
2375         }
2376
2377         /*
2378          * Error recovery is pretty slack.  Clear the page and mark it dirty
2379          * so we'll later zero out any blocks which _were_ allocated.
2380          */
2381         zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
2382         SetPageUptodate(page);
2383         set_page_dirty(page);
2384         return ret;
2385 }
2386 EXPORT_SYMBOL(nobh_prepare_write);
2387
2388 /*
2389  * Make sure any changes to nobh_commit_write() are reflected in
2390  * nobh_truncate_page(), since it doesn't call commit_write().
2391  */
2392 int nobh_commit_write(struct file *file, struct page *page,
2393                 unsigned from, unsigned to)
2394 {
2395         struct inode *inode = page->mapping->host;
2396         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2397
2398         SetPageUptodate(page);
2399         set_page_dirty(page);
2400         if (pos > inode->i_size) {
2401                 i_size_write(inode, pos);
2402                 mark_inode_dirty(inode);
2403         }
2404         return 0;
2405 }
2406 EXPORT_SYMBOL(nobh_commit_write);
2407
2408 /*
2409  * nobh_writepage() - based on block_full_write_page() except
2410  * that it tries to operate without attaching bufferheads to
2411  * the page.
2412  */
2413 int nobh_writepage(struct page *page, get_block_t *get_block,
2414                         struct writeback_control *wbc)
2415 {
2416         struct inode * const inode = page->mapping->host;
2417         loff_t i_size = i_size_read(inode);
2418         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2419         unsigned offset;
2420         int ret;
2421
2422         /* Is the page fully inside i_size? */
2423         if (page->index < end_index)
2424                 goto out;
2425
2426         /* Is the page fully outside i_size? (truncate in progress) */
2427         offset = i_size & (PAGE_CACHE_SIZE-1);
2428         if (page->index >= end_index+1 || !offset) {
2429                 /*
2430                  * The page may have dirty, unmapped buffers.  For example,
2431                  * they may have been added in ext3_writepage().  Make them
2432                  * freeable here, so the page does not leak.
2433                  */
2434 #if 0
2435                 /* Not really sure about this  - do we need this ? */
2436                 if (page->mapping->a_ops->invalidatepage)
2437                         page->mapping->a_ops->invalidatepage(page, offset);
2438 #endif
2439                 unlock_page(page);
2440                 return 0; /* don't care */
2441         }
2442
2443         /*
2444          * The page straddles i_size.  It must be zeroed out on each and every
2445          * writepage invocation because it may be mmapped.  "A file is mapped
2446          * in multiples of the page size.  For a file that is not a multiple of
2447          * the  page size, the remaining memory is zeroed when mapped, and
2448          * writes to that region are not written out to the file."
2449          */
2450         zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
2451 out:
2452         ret = mpage_writepage(page, get_block, wbc);
2453         if (ret == -EAGAIN)
2454                 ret = __block_write_full_page(inode, page, get_block, wbc);
2455         return ret;
2456 }
2457 EXPORT_SYMBOL(nobh_writepage);
2458
2459 /*
2460  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2461  */
2462 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2463 {
2464         struct inode *inode = mapping->host;
2465         unsigned blocksize = 1 << inode->i_blkbits;
2466         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2467         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2468         unsigned to;
2469         struct page *page;
2470         const struct address_space_operations *a_ops = mapping->a_ops;
2471         int ret = 0;
2472
2473         if ((offset & (blocksize - 1)) == 0)
2474                 goto out;
2475
2476         ret = -ENOMEM;
2477         page = grab_cache_page(mapping, index);
2478         if (!page)
2479                 goto out;
2480
2481         to = (offset + blocksize) & ~(blocksize - 1);
2482         ret = a_ops->prepare_write(NULL, page, offset, to);
2483         if (ret == 0) {
2484                 zero_user_page(page, offset, PAGE_CACHE_SIZE - offset,
2485                                 KM_USER0);
2486                 /*
2487                  * It would be more correct to call aops->commit_write()
2488                  * here, but this is more efficient.
2489                  */
2490                 SetPageUptodate(page);
2491                 set_page_dirty(page);
2492         }
2493         unlock_page(page);
2494         page_cache_release(page);
2495 out:
2496         return ret;
2497 }
2498 EXPORT_SYMBOL(nobh_truncate_page);
2499
2500 int block_truncate_page(struct address_space *mapping,
2501                         loff_t from, get_block_t *get_block)
2502 {
2503         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2504         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2505         unsigned blocksize;
2506         sector_t iblock;
2507         unsigned length, pos;
2508         struct inode *inode = mapping->host;
2509         struct page *page;
2510         struct buffer_head *bh;
2511         int err;
2512
2513         blocksize = 1 << inode->i_blkbits;
2514         length = offset & (blocksize - 1);
2515
2516         /* Block boundary? Nothing to do */
2517         if (!length)
2518                 return 0;
2519
2520         length = blocksize - length;
2521         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2522
2523         page = grab_cache_page(mapping, index);
2524         err = -ENOMEM;
2525         if (!page)
2526                 goto out;
2527
2528         if (!page_has_buffers(page))
2529                 create_empty_buffers(page, blocksize, 0);
2530
2531         /* Find the buffer that contains "offset" */
2532         bh = page_buffers(page);
2533         pos = blocksize;
2534         while (offset >= pos) {
2535                 bh = bh->b_this_page;
2536                 iblock++;
2537                 pos += blocksize;
2538         }
2539
2540         err = 0;
2541         if (!buffer_mapped(bh)) {
2542                 WARN_ON(bh->b_size != blocksize);
2543                 err = get_block(inode, iblock, bh, 0);
2544                 if (err)
2545                         goto unlock;
2546                 /* unmapped? It's a hole - nothing to do */
2547                 if (!buffer_mapped(bh))
2548                         goto unlock;
2549         }
2550
2551         /* Ok, it's mapped. Make sure it's up-to-date */
2552         if (PageUptodate(page))
2553                 set_buffer_uptodate(bh);
2554
2555         if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2556                 err = -EIO;
2557                 ll_rw_block(READ, 1, &bh);
2558                 wait_on_buffer(bh);
2559                 /* Uhhuh. Read error. Complain and punt. */
2560                 if (!buffer_uptodate(bh))
2561                         goto unlock;
2562         }
2563
2564         zero_user_page(page, offset, length, KM_USER0);
2565         mark_buffer_dirty(bh);
2566         err = 0;
2567
2568 unlock:
2569         unlock_page(page);
2570         page_cache_release(page);
2571 out:
2572         return err;
2573 }
2574
2575 /*
2576  * The generic ->writepage function for buffer-backed address_spaces
2577  */
2578 int block_write_full_page(struct page *page, get_block_t *get_block,
2579                         struct writeback_control *wbc)
2580 {
2581         struct inode * const inode = page->mapping->host;
2582         loff_t i_size = i_size_read(inode);
2583         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2584         unsigned offset;
2585
2586         /* Is the page fully inside i_size? */
2587         if (page->index < end_index)
2588                 return __block_write_full_page(inode, page, get_block, wbc);
2589
2590         /* Is the page fully outside i_size? (truncate in progress) */
2591         offset = i_size & (PAGE_CACHE_SIZE-1);
2592         if (page->index >= end_index+1 || !offset) {
2593                 /*
2594                  * The page may have dirty, unmapped buffers.  For example,
2595                  * they may have been added in ext3_writepage().  Make them
2596                  * freeable here, so the page does not leak.
2597                  */
2598                 do_invalidatepage(page, 0);
2599                 unlock_page(page);
2600                 return 0; /* don't care */
2601         }
2602
2603         /*
2604          * The page straddles i_size.  It must be zeroed out on each and every
2605          * writepage invokation because it may be mmapped.  "A file is mapped
2606          * in multiples of the page size.  For a file that is not a multiple of
2607          * the  page size, the remaining memory is zeroed when mapped, and
2608          * writes to that region are not written out to the file."
2609          */
2610         zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0);
2611         return __block_write_full_page(inode, page, get_block, wbc);
2612 }
2613
2614 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2615                             get_block_t *get_block)
2616 {
2617         struct buffer_head tmp;
2618         struct inode *inode = mapping->host;
2619         tmp.b_state = 0;
2620         tmp.b_blocknr = 0;
2621         tmp.b_size = 1 << inode->i_blkbits;
2622         get_block(inode, block, &tmp, 0);
2623         return tmp.b_blocknr;
2624 }
2625
2626 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2627 {
2628         struct buffer_head *bh = bio->bi_private;
2629
2630         if (bio->bi_size)
2631                 return 1;
2632
2633         if (err == -EOPNOTSUPP) {
2634                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2635                 set_bit(BH_Eopnotsupp, &bh->b_state);
2636         }
2637
2638         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2639         bio_put(bio);
2640         return 0;
2641 }
2642
2643 int submit_bh(int rw, struct buffer_head * bh)
2644 {
2645         struct bio *bio;
2646         int ret = 0;
2647
2648         BUG_ON(!buffer_locked(bh));
2649         BUG_ON(!buffer_mapped(bh));
2650         BUG_ON(!bh->b_end_io);
2651
2652         if (buffer_ordered(bh) && (rw == WRITE))
2653                 rw = WRITE_BARRIER;
2654
2655         /*
2656          * Only clear out a write error when rewriting, should this
2657          * include WRITE_SYNC as well?
2658          */
2659         if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2660                 clear_buffer_write_io_error(bh);
2661
2662         /*
2663          * from here on down, it's all bio -- do the initial mapping,
2664          * submit_bio -> generic_make_request may further map this bio around
2665          */
2666         bio = bio_alloc(GFP_NOIO, 1);
2667
2668         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2669         bio->bi_bdev = bh->b_bdev;
2670         bio->bi_io_vec[0].bv_page = bh->b_page;
2671         bio->bi_io_vec[0].bv_len = bh->b_size;
2672         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2673
2674         bio->bi_vcnt = 1;
2675         bio->bi_idx = 0;
2676         bio->bi_size = bh->b_size;
2677
2678         bio->bi_end_io = end_bio_bh_io_sync;
2679         bio->bi_private = bh;
2680
2681         bio_get(bio);
2682         submit_bio(rw, bio);
2683
2684         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2685                 ret = -EOPNOTSUPP;
2686
2687         bio_put(bio);
2688         return ret;
2689 }
2690
2691 /**
2692  * ll_rw_block: low-level access to block devices (DEPRECATED)
2693  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2694  * @nr: number of &struct buffer_heads in the array
2695  * @bhs: array of pointers to &struct buffer_head
2696  *
2697  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2698  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2699  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2700  * are sent to disk. The fourth %READA option is described in the documentation
2701  * for generic_make_request() which ll_rw_block() calls.
2702  *
2703  * This function drops any buffer that it cannot get a lock on (with the
2704  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2705  * clean when doing a write request, and any buffer that appears to be
2706  * up-to-date when doing read request.  Further it marks as clean buffers that
2707  * are processed for writing (the buffer cache won't assume that they are
2708  * actually clean until the buffer gets unlocked).
2709  *
2710  * ll_rw_block sets b_end_io to simple completion handler that marks
2711  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2712  * any waiters.
2713  *
2714  * All of the buffers must be for the same device, and must also be a
2715  * multiple of the current approved size for the device.
2716  */
2717 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2718 {
2719         int i;
2720
2721         for (i = 0; i < nr; i++) {
2722                 struct buffer_head *bh = bhs[i];
2723
2724                 if (rw == SWRITE)
2725                         lock_buffer(bh);
2726                 else if (test_set_buffer_locked(bh))
2727                         continue;
2728
2729                 if (rw == WRITE || rw == SWRITE) {
2730                         if (test_clear_buffer_dirty(bh)) {
2731                                 bh->b_end_io = end_buffer_write_sync;
2732                                 get_bh(bh);
2733                                 submit_bh(WRITE, bh);
2734                                 continue;
2735                         }
2736                 } else {
2737                         if (!buffer_uptodate(bh)) {
2738                                 bh->b_end_io = end_buffer_read_sync;
2739                                 get_bh(bh);
2740                                 submit_bh(rw, bh);
2741                                 continue;
2742                         }
2743                 }
2744                 unlock_buffer(bh);
2745         }
2746 }
2747
2748 /*
2749  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2750  * and then start new I/O and then wait upon it.  The caller must have a ref on
2751  * the buffer_head.
2752  */
2753 int sync_dirty_buffer(struct buffer_head *bh)
2754 {
2755         int ret = 0;
2756
2757         WARN_ON(atomic_read(&bh->b_count) < 1);
2758         lock_buffer(bh);
2759         if (test_clear_buffer_dirty(bh)) {
2760                 get_bh(bh);
2761                 bh->b_end_io = end_buffer_write_sync;
2762                 ret = submit_bh(WRITE, bh);
2763                 wait_on_buffer(bh);
2764                 if (buffer_eopnotsupp(bh)) {
2765                         clear_buffer_eopnotsupp(bh);
2766                         ret = -EOPNOTSUPP;
2767                 }
2768                 if (!ret && !buffer_uptodate(bh))
2769                         ret = -EIO;
2770         } else {
2771                 unlock_buffer(bh);
2772         }
2773         return ret;
2774 }
2775
2776 /*
2777  * try_to_free_buffers() checks if all the buffers on this particular page
2778  * are unused, and releases them if so.
2779  *
2780  * Exclusion against try_to_free_buffers may be obtained by either
2781  * locking the page or by holding its mapping's private_lock.
2782  *
2783  * If the page is dirty but all the buffers are clean then we need to
2784  * be sure to mark the page clean as well.  This is because the page
2785  * may be against a block device, and a later reattachment of buffers
2786  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2787  * filesystem data on the same device.
2788  *
2789  * The same applies to regular filesystem pages: if all the buffers are
2790  * clean then we set the page clean and proceed.  To do that, we require
2791  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2792  * private_lock.
2793  *
2794  * try_to_free_buffers() is non-blocking.
2795  */
2796 static inline int buffer_busy(struct buffer_head *bh)
2797 {
2798         return atomic_read(&bh->b_count) |
2799                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2800 }
2801
2802 static int
2803 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2804 {
2805         struct buffer_head *head = page_buffers(page);
2806         struct buffer_head *bh;
2807
2808         bh = head;
2809         do {
2810                 if (buffer_write_io_error(bh) && page->mapping)
2811                         set_bit(AS_EIO, &page->mapping->flags);
2812                 if (buffer_busy(bh))
2813                         goto failed;
2814                 bh = bh->b_this_page;
2815         } while (bh != head);
2816
2817         do {
2818                 struct buffer_head *next = bh->b_this_page;
2819
2820                 if (!list_empty(&bh->b_assoc_buffers))
2821                         __remove_assoc_queue(bh);
2822                 bh = next;
2823         } while (bh != head);
2824         *buffers_to_free = head;
2825         __clear_page_buffers(page);
2826         return 1;
2827 failed:
2828         return 0;
2829 }
2830
2831 int try_to_free_buffers(struct page *page)
2832 {
2833         struct address_space * const mapping = page->mapping;
2834         struct buffer_head *buffers_to_free = NULL;
2835         int ret = 0;
2836
2837         BUG_ON(!PageLocked(page));
2838         if (PageWriteback(page))
2839                 return 0;
2840
2841         if (mapping == NULL) {          /* can this still happen? */
2842                 ret = drop_buffers(page, &buffers_to_free);
2843                 goto out;
2844         }
2845
2846         spin_lock(&mapping->private_lock);
2847         ret = drop_buffers(page, &buffers_to_free);
2848
2849         /*
2850          * If the filesystem writes its buffers by hand (eg ext3)
2851          * then we can have clean buffers against a dirty page.  We
2852          * clean the page here; otherwise the VM will never notice
2853          * that the filesystem did any IO at all.
2854          *
2855          * Also, during truncate, discard_buffer will have marked all
2856          * the page's buffers clean.  We discover that here and clean
2857          * the page also.
2858          *
2859          * private_lock must be held over this entire operation in order
2860          * to synchronise against __set_page_dirty_buffers and prevent the
2861          * dirty bit from being lost.
2862          */
2863         if (ret)
2864                 cancel_dirty_page(page, PAGE_CACHE_SIZE);
2865         spin_unlock(&mapping->private_lock);
2866 out:
2867         if (buffers_to_free) {
2868                 struct buffer_head *bh = buffers_to_free;
2869
2870                 do {
2871                         struct buffer_head *next = bh->b_this_page;
2872                         free_buffer_head(bh);
2873                         bh = next;
2874                 } while (bh != buffers_to_free);
2875         }
2876         return ret;
2877 }
2878 EXPORT_SYMBOL(try_to_free_buffers);
2879
2880 void block_sync_page(struct page *page)
2881 {
2882         struct address_space *mapping;
2883
2884         smp_mb();
2885         mapping = page_mapping(page);
2886         if (mapping)
2887                 blk_run_backing_dev(mapping->backing_dev_info, page);
2888 }
2889
2890 /*
2891  * There are no bdflush tunables left.  But distributions are
2892  * still running obsolete flush daemons, so we terminate them here.
2893  *
2894  * Use of bdflush() is deprecated and will be removed in a future kernel.
2895  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2896  */
2897 asmlinkage long sys_bdflush(int func, long data)
2898 {
2899         static int msg_count;
2900
2901         if (!capable(CAP_SYS_ADMIN))
2902                 return -EPERM;
2903
2904         if (msg_count < 5) {
2905                 msg_count++;
2906                 printk(KERN_INFO
2907                         "warning: process `%s' used the obsolete bdflush"
2908                         " system call\n", current->comm);
2909                 printk(KERN_INFO "Fix your initscripts?\n");
2910         }
2911
2912         if (func == 1)
2913                 do_exit(0);
2914         return 0;
2915 }
2916
2917 /*
2918  * Buffer-head allocation
2919  */
2920 static struct kmem_cache *bh_cachep;
2921
2922 /*
2923  * Once the number of bh's in the machine exceeds this level, we start
2924  * stripping them in writeback.
2925  */
2926 static int max_buffer_heads;
2927
2928 int buffer_heads_over_limit;
2929
2930 struct bh_accounting {
2931         int nr;                 /* Number of live bh's */
2932         int ratelimit;          /* Limit cacheline bouncing */
2933 };
2934
2935 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2936
2937 static void recalc_bh_state(void)
2938 {
2939         int i;
2940         int tot = 0;
2941
2942         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2943                 return;
2944         __get_cpu_var(bh_accounting).ratelimit = 0;
2945         for_each_online_cpu(i)
2946                 tot += per_cpu(bh_accounting, i).nr;
2947         buffer_heads_over_limit = (tot > max_buffer_heads);
2948 }
2949
2950 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2951 {
2952         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
2953         if (ret) {
2954                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
2955                 get_cpu_var(bh_accounting).nr++;
2956                 recalc_bh_state();
2957                 put_cpu_var(bh_accounting);
2958         }
2959         return ret;
2960 }
2961 EXPORT_SYMBOL(alloc_buffer_head);
2962
2963 void free_buffer_head(struct buffer_head *bh)
2964 {
2965         BUG_ON(!list_empty(&bh->b_assoc_buffers));
2966         kmem_cache_free(bh_cachep, bh);
2967         get_cpu_var(bh_accounting).nr--;
2968         recalc_bh_state();
2969         put_cpu_var(bh_accounting);
2970 }
2971 EXPORT_SYMBOL(free_buffer_head);
2972
2973 static void buffer_exit_cpu(int cpu)
2974 {
2975         int i;
2976         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
2977
2978         for (i = 0; i < BH_LRU_SIZE; i++) {
2979                 brelse(b->bhs[i]);
2980                 b->bhs[i] = NULL;
2981         }
2982         get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
2983         per_cpu(bh_accounting, cpu).nr = 0;
2984         put_cpu_var(bh_accounting);
2985 }
2986
2987 static int buffer_cpu_notify(struct notifier_block *self,
2988                               unsigned long action, void *hcpu)
2989 {
2990         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
2991                 buffer_exit_cpu((unsigned long)hcpu);
2992         return NOTIFY_OK;
2993 }
2994
/**
 * bh_uptodate_or_lock: Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Returns true, with the buffer unlocked, if the buffer is up-to-date.
 * Returns false, with the buffer left locked, if it is not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Cheap check first, without taking the lock. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);

	/* It may have become up-to-date while we waited for the lock. */
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3013
3014 /**
3015  * bh_submit_read: Submit a locked buffer for reading
3016  * @bh: struct buffer_head
3017  *
3018  * Returns zero on success and -EIO on error.
3019  */
3020 int bh_submit_read(struct buffer_head *bh)
3021 {
3022         BUG_ON(!buffer_locked(bh));
3023
3024         if (buffer_uptodate(bh)) {
3025                 unlock_buffer(bh);
3026                 return 0;
3027         }
3028
3029         get_bh(bh);
3030         bh->b_end_io = end_buffer_read_sync;
3031         submit_bh(READ, bh);
3032         wait_on_buffer(bh);
3033         if (buffer_uptodate(bh))
3034                 return 0;
3035         return -EIO;
3036 }
3037 EXPORT_SYMBOL(bh_submit_read);
3038
3039 void __init buffer_init(void)
3040 {
3041         int nrpages;
3042
3043         bh_cachep = KMEM_CACHE(buffer_head,
3044                         SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
3045
3046         /*
3047          * Limit the bh occupancy to 10% of ZONE_NORMAL
3048          */
3049         nrpages = (nr_free_buffer_pages() * 10) / 100;
3050         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3051         hotcpu_notifier(buffer_cpu_notify, 0);
3052 }
3053
/*
 * Symbols exported to modules, kept in alphabetical order.  Functions
 * that carry an EXPORT_SYMBOL() directly after their definition (for
 * example nobh_writepage() and try_to_free_buffers()) are not repeated
 * here.
 */
EXPORT_SYMBOL(__bforget);
EXPORT_SYMBOL(__brelse);
EXPORT_SYMBOL(__wait_on_buffer);
EXPORT_SYMBOL(block_commit_write);
EXPORT_SYMBOL(block_prepare_write);
EXPORT_SYMBOL(block_read_full_page);
EXPORT_SYMBOL(block_sync_page);
EXPORT_SYMBOL(block_truncate_page);
EXPORT_SYMBOL(block_write_full_page);
EXPORT_SYMBOL(cont_prepare_write);
EXPORT_SYMBOL(end_buffer_read_sync);
EXPORT_SYMBOL(end_buffer_write_sync);
EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(fsync_bdev);
EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_commit_write);
EXPORT_SYMBOL(generic_cont_expand);
EXPORT_SYMBOL(generic_cont_expand_simple);
EXPORT_SYMBOL(init_buffer);
EXPORT_SYMBOL(invalidate_bdev);
EXPORT_SYMBOL(ll_rw_block);
EXPORT_SYMBOL(mark_buffer_dirty);
EXPORT_SYMBOL(submit_bh);
EXPORT_SYMBOL(sync_dirty_buffer);
EXPORT_SYMBOL(unlock_buffer);