1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
7 /*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
52 bh->b_end_io = handler;
53 bh->b_private = private;
55 EXPORT_SYMBOL(init_buffer);
57 static int sync_buffer(void *word)
59 struct block_device *bd;
60 struct buffer_head *bh
61 = container_of(word, struct buffer_head, b_state);
63 smp_mb();
64 bd = bh->b_bdev;
65 if (bd)
66 blk_run_address_space(bd->bd_inode->i_mapping);
67 io_schedule();
68 return 0;
71 void __lock_buffer(struct buffer_head *bh)
73 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
74 TASK_UNINTERRUPTIBLE);
76 EXPORT_SYMBOL(__lock_buffer);
78 void unlock_buffer(struct buffer_head *bh)
80 clear_bit_unlock(BH_Lock, &bh->b_state);
81 smp_mb__after_clear_bit();
82 wake_up_bit(&bh->b_state, BH_Lock);
84 EXPORT_SYMBOL(unlock_buffer);
87 * Block until a buffer comes unlocked. This doesn't stop it
88 * from becoming locked again - you have to lock it yourself
89 * if you want to preserve its state.
91 void __wait_on_buffer(struct buffer_head * bh)
93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
95 EXPORT_SYMBOL(__wait_on_buffer);
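/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the typical caller pattern for the primitives above.  lock_buffer() takes
 * the BH_Lock bit (sleeping via __lock_buffer() if contended), whereas
 * wait_on_buffer() only waits for the current holder and gives no
 * exclusion afterwards.
 */
static void __maybe_unused example_lock_then_inspect(struct buffer_head *bh)
{
	lock_buffer(bh);		/* exclusive: BH_Lock is now ours */
	if (buffer_uptodate(bh)) {
		/* safe to inspect or modify bh->b_data here */
	}
	unlock_buffer(bh);		/* wakes waiters via wake_up_bit() */
}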
97 static void
98 __clear_page_buffers(struct page *page)
100 ClearPagePrivate(page);
101 set_page_private(page, 0);
102 page_cache_release(page);
106 static int quiet_error(struct buffer_head *bh)
108 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
109 return 0;
110 return 1;
114 static void buffer_io_error(struct buffer_head *bh)
116 char b[BDEVNAME_SIZE];
117 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
118 bdevname(bh->b_bdev, b),
119 (unsigned long long)bh->b_blocknr);
123 * End-of-IO handler helper function which does not touch the bh after
124 * unlocking it.
125 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
126 * a race there is benign: unlock_buffer() only uses the bh's address for
127 * hashing after unlocking the buffer, so it doesn't actually touch the bh
128 * itself.
130 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
132 if (uptodate) {
133 set_buffer_uptodate(bh);
134 } else {
135 /* This happens, due to failed READA attempts. */
136 clear_buffer_uptodate(bh);
138 unlock_buffer(bh);
142 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
143 * unlock the buffer. This is what ll_rw_block uses too.
145 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
147 __end_buffer_read_notouch(bh, uptodate);
148 put_bh(bh);
150 EXPORT_SYMBOL(end_buffer_read_sync);
152 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
154 char b[BDEVNAME_SIZE];
156 if (uptodate) {
157 set_buffer_uptodate(bh);
158 } else {
159 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
160 buffer_io_error(bh);
161 printk(KERN_WARNING "lost page write due to "
162 "I/O error on %s\n",
163 bdevname(bh->b_bdev, b));
165 set_buffer_write_io_error(bh);
166 clear_buffer_uptodate(bh);
168 unlock_buffer(bh);
169 put_bh(bh);
171 EXPORT_SYMBOL(end_buffer_write_sync);
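/*
 * Illustrative sketch (editor's addition): how a caller drives a single
 * synchronous write with the handler above.  This mirrors what
 * sync_dirty_buffer() does elsewhere in the kernel; the helper name here
 * is hypothetical.
 */
static int __maybe_unused example_write_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (test_clear_buffer_dirty(bh)) {
		get_bh(bh);		/* ref dropped by end_buffer_write_sync() */
		bh->b_end_io = end_buffer_write_sync;
		submit_bh(WRITE, bh);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			return -EIO;
	} else {
		unlock_buffer(bh);	/* nothing dirty, nothing to do */
	}
	return 0;
}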
174 * Various filesystems appear to want __find_get_block to be non-blocking.
175 * But it's the page lock which protects the buffers. To get around this,
176 * we get exclusion from try_to_free_buffers with the blockdev mapping's
177 * private_lock.
179 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
180 * may be quite high. This code could TryLock the page, and if that
181 * succeeds, there is no need to take private_lock. (But if
182 * private_lock is contended then so is mapping->tree_lock).
184 static struct buffer_head *
185 __find_get_block_slow(struct block_device *bdev, sector_t block)
187 struct inode *bd_inode = bdev->bd_inode;
188 struct address_space *bd_mapping = bd_inode->i_mapping;
189 struct buffer_head *ret = NULL;
190 pgoff_t index;
191 struct buffer_head *bh;
192 struct buffer_head *head;
193 struct page *page;
194 int all_mapped = 1;
196 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
197 page = find_get_page(bd_mapping, index);
198 if (!page)
199 goto out;
201 spin_lock(&bd_mapping->private_lock);
202 if (!page_has_buffers(page))
203 goto out_unlock;
204 head = page_buffers(page);
205 bh = head;
206 do {
207 if (!buffer_mapped(bh))
208 all_mapped = 0;
209 else if (bh->b_blocknr == block) {
210 ret = bh;
211 get_bh(bh);
212 goto out_unlock;
214 bh = bh->b_this_page;
215 } while (bh != head);
217 /* we might be here because some of the buffers on this page are
218 * not mapped. This is due to various races between
219 * file io on the block device and getblk. It gets dealt with
220 * elsewhere, don't buffer_error if we had some unmapped buffers
222 if (all_mapped) {
223 printk("__find_get_block_slow() failed. "
224 "block=%llu, b_blocknr=%llu\n",
225 (unsigned long long)block,
226 (unsigned long long)bh->b_blocknr);
227 printk("b_state=0x%08lx, b_size=%zu\n",
228 bh->b_state, bh->b_size);
229 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
231 out_unlock:
232 spin_unlock(&bd_mapping->private_lock);
233 page_cache_release(page);
234 out:
235 return ret;
238 /* If invalidate_buffers() will trash dirty buffers, it means some kind
239 of fs corruption is going on. Trashing dirty data always implies losing
240 information that was supposed to be just stored on the physical layer
241 by the user.
243 Thus invalidate_buffers in general usage is not allowed to trash
244 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
245 be preserved. These buffers are simply skipped.
247 We also skip buffers which are still in use. For example this can
248 happen if a userspace program is reading the block device.
250 NOTE: If the user removes a removable-media disk while there is
251 still dirty data not synced to disk (due to a bug in the device driver
252 or to a user error), then by not destroying the dirty buffers we could
253 also corrupt the next media inserted. A parameter is therefore
254 necessary to handle this case in the safest way possible (trying
255 not to corrupt the newly inserted disk with data belonging to
256 the old, now corrupted, disk). Also, for the ramdisk the natural thing
257 to do in order to release the ramdisk memory is to destroy dirty buffers.
259 These are two special cases. Normal usage implies that the device driver
260 issues a sync on the device (without waiting for I/O completion) and
261 then an invalidate_buffers call that doesn't trash dirty buffers.
263 For handling cache coherency with the blkdev pagecache the 'update' case
264 has been introduced. It is needed to re-read from disk any pinned
265 buffer. NOTE: re-reading from disk is destructive so we can do it only
266 when we assume nobody is changing the buffercache under our I/O and when
267 we think the disk contains more recent information than the buffercache.
268 The update == 1 pass marks the buffers we need to update, the update == 2
269 pass does the actual I/O. */
270 void invalidate_bdev(struct block_device *bdev)
272 struct address_space *mapping = bdev->bd_inode->i_mapping;
274 if (mapping->nrpages == 0)
275 return;
277 invalidate_bh_lrus();
278 invalidate_mapping_pages(mapping, 0, -1);
280 EXPORT_SYMBOL(invalidate_bdev);
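/*
 * Illustrative sketch (editor's addition): the "normal usage" described in
 * the comment above - flush dirty data first, then drop the now-clean
 * cache.  The device is assumed to be a valid, opened block device.
 */
static void __maybe_unused example_drop_bdev_cache(struct block_device *bdev)
{
	sync_blockdev(bdev);		/* write out dirty pages/buffers */
	invalidate_bdev(bdev);		/* then drop the clean cached pages */
}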
283 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
285 static void free_more_memory(void)
287 struct zone *zone;
288 int nid;
290 wakeup_flusher_threads(1024);
291 yield();
293 for_each_online_node(nid) {
294 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
295 gfp_zone(GFP_NOFS), NULL,
296 &zone);
297 if (zone)
298 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
299 GFP_NOFS, NULL);
304 * I/O completion handler for block_read_full_page() - pages
305 * which come unlocked at the end of I/O.
307 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
309 unsigned long flags;
310 struct buffer_head *first;
311 struct buffer_head *tmp;
312 struct page *page;
313 int page_uptodate = 1;
315 BUG_ON(!buffer_async_read(bh));
317 page = bh->b_page;
318 if (uptodate) {
319 set_buffer_uptodate(bh);
320 } else {
321 clear_buffer_uptodate(bh);
322 if (!quiet_error(bh))
323 buffer_io_error(bh);
324 SetPageError(page);
328 * Be _very_ careful from here on. Bad things can happen if
329 * two buffer heads end IO at almost the same time and both
330 * decide that the page is now completely done.
332 first = page_buffers(page);
333 local_irq_save(flags);
334 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
335 clear_buffer_async_read(bh);
336 unlock_buffer(bh);
337 tmp = bh;
338 do {
339 if (!buffer_uptodate(tmp))
340 page_uptodate = 0;
341 if (buffer_async_read(tmp)) {
342 BUG_ON(!buffer_locked(tmp));
343 goto still_busy;
345 tmp = tmp->b_this_page;
346 } while (tmp != bh);
347 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
348 local_irq_restore(flags);
351 * If none of the buffers had errors and they are all
352 * uptodate then we can set the page uptodate.
354 if (page_uptodate && !PageError(page))
355 SetPageUptodate(page);
356 unlock_page(page);
357 return;
359 still_busy:
360 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
361 local_irq_restore(flags);
362 return;
366 * Completion handler for block_write_full_page() - pages which are unlocked
367 * during I/O, and which have PageWriteback cleared upon I/O completion.
369 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
371 char b[BDEVNAME_SIZE];
372 unsigned long flags;
373 struct buffer_head *first;
374 struct buffer_head *tmp;
375 struct page *page;
377 BUG_ON(!buffer_async_write(bh));
379 page = bh->b_page;
380 if (uptodate) {
381 set_buffer_uptodate(bh);
382 } else {
383 if (!quiet_error(bh)) {
384 buffer_io_error(bh);
385 printk(KERN_WARNING "lost page write due to "
386 "I/O error on %s\n",
387 bdevname(bh->b_bdev, b));
389 set_bit(AS_EIO, &page->mapping->flags);
390 set_buffer_write_io_error(bh);
391 clear_buffer_uptodate(bh);
392 SetPageError(page);
395 first = page_buffers(page);
396 local_irq_save(flags);
397 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
399 clear_buffer_async_write(bh);
400 unlock_buffer(bh);
401 tmp = bh->b_this_page;
402 while (tmp != bh) {
403 if (buffer_async_write(tmp)) {
404 BUG_ON(!buffer_locked(tmp));
405 goto still_busy;
407 tmp = tmp->b_this_page;
409 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
410 local_irq_restore(flags);
411 end_page_writeback(page);
412 return;
414 still_busy:
415 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
416 local_irq_restore(flags);
417 return;
419 EXPORT_SYMBOL(end_buffer_async_write);
422 * If a page's buffers are under async read-in (end_buffer_async_read
423 * completion) then there is a possibility that another thread of
424 * control could lock one of the buffers after it has completed
425 * but while some of the other buffers have not completed. This
426 * locked buffer would confuse end_buffer_async_read() into not unlocking
427 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
428 * that this buffer is not under async I/O.
430 * The page comes unlocked when it has no locked buffer_async buffers
431 * left.
433 * PageLocked prevents anyone from starting new async I/O against any of
434 * the buffers.
436 * PageWriteback is used to prevent simultaneous writeout of the same
437 * page.
439 * PageLocked prevents anyone from starting writeback of a page which is
440 * under read I/O (PageWriteback is only ever set against a locked page).
442 static void mark_buffer_async_read(struct buffer_head *bh)
444 bh->b_end_io = end_buffer_async_read;
445 set_buffer_async_read(bh);
448 static void mark_buffer_async_write_endio(struct buffer_head *bh,
449 bh_end_io_t *handler)
451 bh->b_end_io = handler;
452 set_buffer_async_write(bh);
455 void mark_buffer_async_write(struct buffer_head *bh)
457 mark_buffer_async_write_endio(bh, end_buffer_async_write);
459 EXPORT_SYMBOL(mark_buffer_async_write);
463 * fs/buffer.c contains helper functions for buffer-backed address space's
464 * fsync functions. A common requirement for buffer-based filesystems is
465 * that certain data from the backing blockdev needs to be written out for
466 * a successful fsync(). For example, ext2 indirect blocks need to be
467 * written back and waited upon before fsync() returns.
469 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
470 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
471 * management of a list of dependent buffers at ->i_mapping->private_list.
473 * Locking is a little subtle: try_to_free_buffers() will remove buffers
474 * from their controlling inode's queue when they are being freed. But
475 * try_to_free_buffers() will be operating against the *blockdev* mapping
476 * at the time, not against the S_ISREG file which depends on those buffers.
477 * So the locking for private_list is via the private_lock in the address_space
478 * which backs the buffers. Which is different from the address_space
479 * against which the buffers are listed. So for a particular address_space,
480 * mapping->private_lock does *not* protect mapping->private_list! In fact,
481 * mapping->private_list will always be protected by the backing blockdev's
482 * ->private_lock.
484 * Which introduces a requirement: all buffers on an address_space's
485 * ->private_list must be from the same address_space: the blockdev's.
487 * address_spaces which do not place buffers at ->private_list via these
488 * utility functions are free to use private_lock and private_list for
489 * whatever they want. The only requirement is that list_empty(private_list)
490 * be true at clear_inode() time.
492 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
493 * filesystems should do that. invalidate_inode_buffers() should just go
494 * BUG_ON(!list_empty).
496 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
497 * take an address_space, not an inode. And it should be called
498 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
499 * queued up.
501 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
502 * list if it is already on a list. Because if the buffer is on a list,
503 * it *must* already be on the right one. If not, the filesystem is being
504 * silly. This will save a ton of locking. But first we have to ensure
505 * that buffers are taken *off* the old inode's list when they are freed
506 * (presumably in truncate). That requires careful auditing of all
507 * filesystems (do it inside bforget()). It could also be done by bringing
508 * b_inode back.
512 * The buffer's backing address_space's private_lock must be held
514 static void __remove_assoc_queue(struct buffer_head *bh)
516 list_del_init(&bh->b_assoc_buffers);
517 WARN_ON(!bh->b_assoc_map);
518 if (buffer_write_io_error(bh))
519 set_bit(AS_EIO, &bh->b_assoc_map->flags);
520 bh->b_assoc_map = NULL;
523 int inode_has_buffers(struct inode *inode)
525 return !list_empty(&inode->i_data.private_list);
529 * osync is designed to support O_SYNC io. It waits synchronously for
530 * all already-submitted IO to complete, but does not queue any new
531 * writes to the disk.
533 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
534 * you dirty the buffers, and then use osync_inode_buffers to wait for
535 * completion. Any other dirty buffers which are not yet queued for
536 * write will not be flushed to disk by the osync.
538 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
540 struct buffer_head *bh;
541 struct list_head *p;
542 int err = 0;
544 spin_lock(lock);
545 repeat:
546 list_for_each_prev(p, list) {
547 bh = BH_ENTRY(p);
548 if (buffer_locked(bh)) {
549 get_bh(bh);
550 spin_unlock(lock);
551 wait_on_buffer(bh);
552 if (!buffer_uptodate(bh))
553 err = -EIO;
554 brelse(bh);
555 spin_lock(lock);
556 goto repeat;
559 spin_unlock(lock);
560 return err;
563 static void do_thaw_one(struct super_block *sb, void *unused)
565 char b[BDEVNAME_SIZE];
566 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
567 printk(KERN_WARNING "Emergency Thaw on %s\n",
568 bdevname(sb->s_bdev, b));
571 static void do_thaw_all(struct work_struct *work)
573 iterate_supers(do_thaw_one, NULL);
574 kfree(work);
575 printk(KERN_WARNING "Emergency Thaw complete\n");
579 * emergency_thaw_all -- forcibly thaw every frozen filesystem
581 * Used for emergency unfreeze of all filesystems via SysRq
583 void emergency_thaw_all(void)
585 struct work_struct *work;
587 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588 if (work) {
589 INIT_WORK(work, do_thaw_all);
590 schedule_work(work);
595 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
596 * @mapping: the mapping which wants those buffers written
598 * Starts I/O against the buffers at mapping->private_list, and waits upon
599 * that I/O.
601 * Basically, this is a convenience function for fsync().
602 * @mapping is a file or directory which needs those buffers to be written for
603 * a successful fsync().
605 int sync_mapping_buffers(struct address_space *mapping)
607 struct address_space *buffer_mapping = mapping->assoc_mapping;
609 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
610 return 0;
612 return fsync_buffers_list(&buffer_mapping->private_lock,
613 &mapping->private_list);
615 EXPORT_SYMBOL(sync_mapping_buffers);
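/*
 * Illustrative sketch (editor's addition): a filesystem fsync path would
 * typically call this helper on the inode being synced, in the style of
 * ext2; the wrapper name here is hypothetical.
 */
static int __maybe_unused example_sync_dependent_buffers(struct inode *inode)
{
	/* Write out and wait on inode->i_mapping->private_list, i.e. the
	 * blockdev buffers (such as indirect blocks) this file depends on. */
	return sync_mapping_buffers(inode->i_mapping);
}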
618 * Called when we've recently written block `bblock', and it is known that
619 * `bblock' was for a buffer_boundary() buffer. This means that the block at
620 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
621 * dirty, schedule it for IO. So that indirects merge nicely with their data.
623 void write_boundary_block(struct block_device *bdev,
624 sector_t bblock, unsigned blocksize)
626 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
627 if (bh) {
628 if (buffer_dirty(bh))
629 ll_rw_block(WRITE, 1, &bh);
630 put_bh(bh);
634 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
636 struct address_space *mapping = inode->i_mapping;
637 struct address_space *buffer_mapping = bh->b_page->mapping;
639 mark_buffer_dirty(bh);
640 if (!mapping->assoc_mapping) {
641 mapping->assoc_mapping = buffer_mapping;
642 } else {
643 BUG_ON(mapping->assoc_mapping != buffer_mapping);
645 if (!bh->b_assoc_map) {
646 spin_lock(&buffer_mapping->private_lock);
647 list_move_tail(&bh->b_assoc_buffers,
648 &mapping->private_list);
649 bh->b_assoc_map = mapping;
650 spin_unlock(&buffer_mapping->private_lock);
653 EXPORT_SYMBOL(mark_buffer_dirty_inode);
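/*
 * Illustrative sketch (editor's addition): tying a piece of on-disk
 * metadata to the file that depends on it, so that the fsync helpers above
 * will later write it out.  The buffer is assumed to come from the backing
 * block device (e.g. obtained via sb_bread()).
 */
static void __maybe_unused example_dirty_dependent_metadata(struct inode *inode,
							     struct buffer_head *bh)
{
	/* ... modify the metadata in bh->b_data ... */
	mark_buffer_dirty_inode(bh, inode);	/* queue on i_mapping->private_list */
}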
656 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
657 * dirty.
659 * If warn is true, then emit a warning if the page is not uptodate and has
660 * not been truncated.
662 static void __set_page_dirty(struct page *page,
663 struct address_space *mapping, int warn)
665 spin_lock_irq(&mapping->tree_lock);
666 if (page->mapping) { /* Race with truncate? */
667 WARN_ON_ONCE(warn && !PageUptodate(page));
668 account_page_dirtied(page, mapping);
669 radix_tree_tag_set(&mapping->page_tree,
670 page_index(page), PAGECACHE_TAG_DIRTY);
672 spin_unlock_irq(&mapping->tree_lock);
673 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
677 * Add a page to the dirty page list.
679 * It is a sad fact of life that this function is called from several places
680 * deeply under spinlocking. It may not sleep.
682 * If the page has buffers, the uptodate buffers are set dirty, to preserve
683 * dirty-state coherency between the page and the buffers. If the page does
684 * not have buffers then when they are later attached they will all be set
685 * dirty.
687 * The buffers are dirtied before the page is dirtied. There's a small race
688 * window in which a writepage caller may see the page cleanness but not the
689 * buffer dirtiness. That's fine. If this code were to set the page dirty
690 * before the buffers, a concurrent writepage caller could clear the page dirty
691 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
692 * page on the dirty page list.
694 * We use private_lock to lock against try_to_free_buffers while using the
695 * page's buffer list. Also use this to protect against clean buffers being
696 * added to the page after it was set dirty.
698 * FIXME: may need to call ->reservepage here as well. That's rather up to the
699 * address_space though.
701 int __set_page_dirty_buffers(struct page *page)
703 int newly_dirty;
704 struct address_space *mapping = page_mapping(page);
706 if (unlikely(!mapping))
707 return !TestSetPageDirty(page);
709 spin_lock(&mapping->private_lock);
710 if (page_has_buffers(page)) {
711 struct buffer_head *head = page_buffers(page);
712 struct buffer_head *bh = head;
714 do {
715 set_buffer_dirty(bh);
716 bh = bh->b_this_page;
717 } while (bh != head);
719 newly_dirty = !TestSetPageDirty(page);
720 spin_unlock(&mapping->private_lock);
722 if (newly_dirty)
723 __set_page_dirty(page, mapping, 1);
724 return newly_dirty;
726 EXPORT_SYMBOL(__set_page_dirty_buffers);
729 * Write out and wait upon a list of buffers.
731 * We have conflicting pressures: we want to make sure that all
732 * initially dirty buffers get waited on, but that any subsequently
733 * dirtied buffers don't. After all, we don't want fsync to last
734 * forever if somebody is actively writing to the file.
736 * Do this in two main stages: first we copy dirty buffers to a
737 * temporary inode list, queueing the writes as we go. Then we clean
738 * up, waiting for those writes to complete.
740 * During this second stage, any subsequent updates to the file may end
741 * up refiling the buffer on the original inode's dirty list again, so
742 * there is a chance we will end up with a buffer queued for write but
743 * not yet completed on that list. So, as a final cleanup we go through
744 * the osync code to catch these locked, dirty buffers without requeuing
745 * any newly dirty buffers for write.
747 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
749 struct buffer_head *bh;
750 struct list_head tmp;
751 struct address_space *mapping, *prev_mapping = NULL;
752 int err = 0, err2;
754 INIT_LIST_HEAD(&tmp);
756 spin_lock(lock);
757 while (!list_empty(list)) {
758 bh = BH_ENTRY(list->next);
759 mapping = bh->b_assoc_map;
760 __remove_assoc_queue(bh);
761 /* Avoid race with mark_buffer_dirty_inode() which does
762 * a lockless check and we rely on seeing the dirty bit */
763 smp_mb();
764 if (buffer_dirty(bh) || buffer_locked(bh)) {
765 list_add(&bh->b_assoc_buffers, &tmp);
766 bh->b_assoc_map = mapping;
767 if (buffer_dirty(bh)) {
768 get_bh(bh);
769 spin_unlock(lock);
771 * Ensure any pending I/O completes so that
772 * ll_rw_block() actually writes the current
773 * contents - it is a noop if I/O is still in
774 * flight on potentially older contents.
776 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
779 * Kick off IO for the previous mapping. Note
780 * that we will not run the very last mapping,
781 * wait_on_buffer() will do that for us
782 * through sync_buffer().
784 if (prev_mapping && prev_mapping != mapping)
785 blk_run_address_space(prev_mapping);
786 prev_mapping = mapping;
788 brelse(bh);
789 spin_lock(lock);
794 while (!list_empty(&tmp)) {
795 bh = BH_ENTRY(tmp.prev);
796 get_bh(bh);
797 mapping = bh->b_assoc_map;
798 __remove_assoc_queue(bh);
799 /* Avoid race with mark_buffer_dirty_inode() which does
800 * a lockless check and we rely on seeing the dirty bit */
801 smp_mb();
802 if (buffer_dirty(bh)) {
803 list_add(&bh->b_assoc_buffers,
804 &mapping->private_list);
805 bh->b_assoc_map = mapping;
807 spin_unlock(lock);
808 wait_on_buffer(bh);
809 if (!buffer_uptodate(bh))
810 err = -EIO;
811 brelse(bh);
812 spin_lock(lock);
815 spin_unlock(lock);
816 err2 = osync_buffers_list(lock, list);
817 if (err)
818 return err;
819 else
820 return err2;
824 * Invalidate any and all dirty buffers on a given inode. We are
825 * probably unmounting the fs, but that doesn't mean we have already
826 * done a sync(). Just drop the buffers from the inode list.
828 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
829 * assumes that all the buffers are against the blockdev. Not true
830 * for reiserfs.
832 void invalidate_inode_buffers(struct inode *inode)
834 if (inode_has_buffers(inode)) {
835 struct address_space *mapping = &inode->i_data;
836 struct list_head *list = &mapping->private_list;
837 struct address_space *buffer_mapping = mapping->assoc_mapping;
839 spin_lock(&buffer_mapping->private_lock);
840 while (!list_empty(list))
841 __remove_assoc_queue(BH_ENTRY(list->next));
842 spin_unlock(&buffer_mapping->private_lock);
845 EXPORT_SYMBOL(invalidate_inode_buffers);
848 * Remove any clean buffers from the inode's buffer list. This is called
849 * when we're trying to free the inode itself. Those buffers can pin it.
851 * Returns true if all buffers were removed.
853 int remove_inode_buffers(struct inode *inode)
855 int ret = 1;
857 if (inode_has_buffers(inode)) {
858 struct address_space *mapping = &inode->i_data;
859 struct list_head *list = &mapping->private_list;
860 struct address_space *buffer_mapping = mapping->assoc_mapping;
862 spin_lock(&buffer_mapping->private_lock);
863 while (!list_empty(list)) {
864 struct buffer_head *bh = BH_ENTRY(list->next);
865 if (buffer_dirty(bh)) {
866 ret = 0;
867 break;
869 __remove_assoc_queue(bh);
871 spin_unlock(&buffer_mapping->private_lock);
873 return ret;
877 * Create the appropriate buffers when given a page for data area and
878 * the size of each buffer.. Use the bh->b_this_page linked list to
879 * follow the buffers created. Return NULL if unable to create more
880 * buffers.
882 * The retry flag is used to differentiate async IO (paging, swapping)
883 * which may not fail from ordinary buffer allocations.
885 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
886 int retry)
888 struct buffer_head *bh, *head;
889 long offset;
891 try_again:
892 head = NULL;
893 offset = PAGE_SIZE;
894 while ((offset -= size) >= 0) {
895 bh = alloc_buffer_head(GFP_NOFS);
896 if (!bh)
897 goto no_grow;
899 bh->b_bdev = NULL;
900 bh->b_this_page = head;
901 bh->b_blocknr = -1;
902 head = bh;
904 bh->b_state = 0;
905 atomic_set(&bh->b_count, 0);
906 bh->b_private = NULL;
907 bh->b_size = size;
909 /* Link the buffer to its page */
910 set_bh_page(bh, page, offset);
912 init_buffer(bh, NULL, NULL);
914 return head;
916 * In case anything failed, we just free everything we got.
918 no_grow:
919 if (head) {
920 do {
921 bh = head;
922 head = head->b_this_page;
923 free_buffer_head(bh);
924 } while (head);
928 * Return failure for non-async IO requests. Async IO requests
929 * are not allowed to fail, so we have to wait until buffer heads
930 * become available. But we don't want tasks sleeping with
931 * partially complete buffers, so all were released above.
933 if (!retry)
934 return NULL;
936 /* We're _really_ low on memory. Now we just
937 * wait for old buffer heads to become free due to
938 * finishing IO. Since this is an async request and
939 * the reserve list is empty, we're sure there are
940 * async buffer heads in use.
942 free_more_memory();
943 goto try_again;
945 EXPORT_SYMBOL_GPL(alloc_page_buffers);
947 static inline void
948 link_dev_buffers(struct page *page, struct buffer_head *head)
950 struct buffer_head *bh, *tail;
952 bh = head;
953 do {
954 tail = bh;
955 bh = bh->b_this_page;
956 } while (bh);
957 tail->b_this_page = head;
958 attach_page_buffers(page, head);
962 * Initialise the state of a blockdev page's buffers.
964 static void
965 init_page_buffers(struct page *page, struct block_device *bdev,
966 sector_t block, int size)
968 struct buffer_head *head = page_buffers(page);
969 struct buffer_head *bh = head;
970 int uptodate = PageUptodate(page);
972 do {
973 if (!buffer_mapped(bh)) {
974 init_buffer(bh, NULL, NULL);
975 bh->b_bdev = bdev;
976 bh->b_blocknr = block;
977 if (uptodate)
978 set_buffer_uptodate(bh);
979 set_buffer_mapped(bh);
981 block++;
982 bh = bh->b_this_page;
983 } while (bh != head);
987 * Create the page-cache page that contains the requested block.
989 * This is used purely for blockdev mappings.
991 static struct page *
992 grow_dev_page(struct block_device *bdev, sector_t block,
993 pgoff_t index, int size)
995 struct inode *inode = bdev->bd_inode;
996 struct page *page;
997 struct buffer_head *bh;
999 page = find_or_create_page(inode->i_mapping, index,
1000 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1001 if (!page)
1002 return NULL;
1004 BUG_ON(!PageLocked(page));
1006 if (page_has_buffers(page)) {
1007 bh = page_buffers(page);
1008 if (bh->b_size == size) {
1009 init_page_buffers(page, bdev, block, size);
1010 return page;
1012 if (!try_to_free_buffers(page))
1013 goto failed;
1017 * Allocate some buffers for this page
1019 bh = alloc_page_buffers(page, size, 0);
1020 if (!bh)
1021 goto failed;
1024 * Link the page to the buffers and initialise them. Take the
1025 * lock to be atomic wrt __find_get_block(), which does not
1026 * run under the page lock.
1028 spin_lock(&inode->i_mapping->private_lock);
1029 link_dev_buffers(page, bh);
1030 init_page_buffers(page, bdev, block, size);
1031 spin_unlock(&inode->i_mapping->private_lock);
1032 return page;
1034 failed:
1035 BUG();
1036 unlock_page(page);
1037 page_cache_release(page);
1038 return NULL;
1042 * Create buffers for the specified block device block's page. If
1043 * that page was dirty, the buffers are set dirty also.
1045 static int
1046 grow_buffers(struct block_device *bdev, sector_t block, int size)
1048 struct page *page;
1049 pgoff_t index;
1050 int sizebits;
1052 sizebits = -1;
1053 do {
1054 sizebits++;
1055 } while ((size << sizebits) < PAGE_SIZE);
1057 index = block >> sizebits;
1060 * Check for a block which wants to lie outside our maximum possible
1061 * pagecache index. (this comparison is done using sector_t types).
1063 if (unlikely(index != block >> sizebits)) {
1064 char b[BDEVNAME_SIZE];
1066 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1067 "device %s\n",
1068 __func__, (unsigned long long)block,
1069 bdevname(bdev, b));
1070 return -EIO;
1072 block = index << sizebits;
1073 /* Create a page with the proper size buffers.. */
1074 page = grow_dev_page(bdev, block, index, size);
1075 if (!page)
1076 return 0;
1077 unlock_page(page);
1078 page_cache_release(page);
1079 return 1;
1082 static struct buffer_head *
1083 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1085 /* Size must be multiple of hard sectorsize */
1086 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1087 (size < 512 || size > PAGE_SIZE))) {
1088 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1089 size);
1090 printk(KERN_ERR "logical block size: %d\n",
1091 bdev_logical_block_size(bdev));
1093 dump_stack();
1094 return NULL;
1097 for (;;) {
1098 struct buffer_head * bh;
1099 int ret;
1101 bh = __find_get_block(bdev, block, size);
1102 if (bh)
1103 return bh;
1105 ret = grow_buffers(bdev, block, size);
1106 if (ret < 0)
1107 return NULL;
1108 if (ret == 0)
1109 free_more_memory();
1114 * The relationship between dirty buffers and dirty pages:
1116 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1117 * the page is tagged dirty in its radix tree.
1119 * At all times, the dirtiness of the buffers represents the dirtiness of
1120 * subsections of the page. If the page has buffers, the page dirty bit is
1121 * merely a hint about the true dirty state.
1123 * When a page is set dirty in its entirety, all its buffers are marked dirty
1124 * (if the page has buffers).
1126 * When a buffer is marked dirty, its page is dirtied, but the page's other
1127 * buffers are not.
1129 * Also. When blockdev buffers are explicitly read with bread(), they
1130 * individually become uptodate. But their backing page remains not
1131 * uptodate - even if all of its buffers are uptodate. A subsequent
1132 * block_read_full_page() against that page will discover all the uptodate
1133 * buffers, will set the page uptodate and will perform no I/O.
1137 * mark_buffer_dirty - mark a buffer_head as needing writeout
1138 * @bh: the buffer_head to mark dirty
1140 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1141 * backing page dirty, then tag the page as dirty in its address_space's radix
1142 * tree and then attach the address_space's inode to its superblock's dirty
1143 * inode list.
1145 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1146 * mapping->tree_lock and the global inode_lock.
1148 void mark_buffer_dirty(struct buffer_head *bh)
1150 WARN_ON_ONCE(!buffer_uptodate(bh));
1153 * Very *carefully* optimize the it-is-already-dirty case.
1155 * Don't let the final "is it dirty" escape to before we
1156 * perhaps modified the buffer.
1158 if (buffer_dirty(bh)) {
1159 smp_mb();
1160 if (buffer_dirty(bh))
1161 return;
1164 if (!test_set_buffer_dirty(bh)) {
1165 struct page *page = bh->b_page;
1166 if (!TestSetPageDirty(page)) {
1167 struct address_space *mapping = page_mapping(page);
1168 if (mapping)
1169 __set_page_dirty(page, mapping, 0);
1173 EXPORT_SYMBOL(mark_buffer_dirty);
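/*
 * Illustrative sketch (editor's addition): filling a block from scratch.
 * __getblk() may return a buffer with stale contents, so a caller that
 * overwrites the whole block marks it uptodate itself before dirtying it.
 * The zero fill is a placeholder for real data.
 */
static void __maybe_unused example_fill_block(struct block_device *bdev,
					      sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	if (!bh)
		return;
	lock_buffer(bh);
	memset(bh->b_data, 0, size);	/* overwrite the entire block */
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);		/* page and inode become dirty too */
	unlock_buffer(bh);
	brelse(bh);			/* writeback happens later */
}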
1176 * Decrement a buffer_head's reference count. If all buffers against a page
1177 * have zero reference count, are clean and unlocked, and if the page is clean
1178 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1179 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1180 * a page but it ends up not being freed, and buffers may later be reattached).
1182 void __brelse(struct buffer_head * buf)
1184 if (atomic_read(&buf->b_count)) {
1185 put_bh(buf);
1186 return;
1188 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1190 EXPORT_SYMBOL(__brelse);
1193 * bforget() is like brelse(), except it discards any
1194 * potentially dirty data.
1196 void __bforget(struct buffer_head *bh)
1198 clear_buffer_dirty(bh);
1199 if (bh->b_assoc_map) {
1200 struct address_space *buffer_mapping = bh->b_page->mapping;
1202 spin_lock(&buffer_mapping->private_lock);
1203 list_del_init(&bh->b_assoc_buffers);
1204 bh->b_assoc_map = NULL;
1205 spin_unlock(&buffer_mapping->private_lock);
1207 __brelse(bh);
1209 EXPORT_SYMBOL(__bforget);
1211 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1213 lock_buffer(bh);
1214 if (buffer_uptodate(bh)) {
1215 unlock_buffer(bh);
1216 return bh;
1217 } else {
1218 get_bh(bh);
1219 bh->b_end_io = end_buffer_read_sync;
1220 submit_bh(READ, bh);
1221 wait_on_buffer(bh);
1222 if (buffer_uptodate(bh))
1223 return bh;
1225 brelse(bh);
1226 return NULL;
1230 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1231 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1232 * refcount elevated by one when they're in an LRU. A buffer can only appear
1233 * once in a particular CPU's LRU. A single buffer can be present in multiple
1234 * CPU's LRUs at the same time.
1236 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1237 * sb_find_get_block().
1239 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1240 * a local interrupt disable for that.
1243 #define BH_LRU_SIZE 8
1245 struct bh_lru {
1246 struct buffer_head *bhs[BH_LRU_SIZE];
1249 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1251 #ifdef CONFIG_SMP
1252 #define bh_lru_lock() local_irq_disable()
1253 #define bh_lru_unlock() local_irq_enable()
1254 #else
1255 #define bh_lru_lock() preempt_disable()
1256 #define bh_lru_unlock() preempt_enable()
1257 #endif
1259 static inline void check_irqs_on(void)
1261 #ifdef irqs_disabled
1262 BUG_ON(irqs_disabled());
1263 #endif
1267 * The LRU management algorithm is dopey-but-simple. Sorry.
1269 static void bh_lru_install(struct buffer_head *bh)
1271 struct buffer_head *evictee = NULL;
1272 struct bh_lru *lru;
1274 check_irqs_on();
1275 bh_lru_lock();
1276 lru = &__get_cpu_var(bh_lrus);
1277 if (lru->bhs[0] != bh) {
1278 struct buffer_head *bhs[BH_LRU_SIZE];
1279 int in;
1280 int out = 0;
1282 get_bh(bh);
1283 bhs[out++] = bh;
1284 for (in = 0; in < BH_LRU_SIZE; in++) {
1285 struct buffer_head *bh2 = lru->bhs[in];
1287 if (bh2 == bh) {
1288 __brelse(bh2);
1289 } else {
1290 if (out >= BH_LRU_SIZE) {
1291 BUG_ON(evictee != NULL);
1292 evictee = bh2;
1293 } else {
1294 bhs[out++] = bh2;
1298 while (out < BH_LRU_SIZE)
1299 bhs[out++] = NULL;
1300 memcpy(lru->bhs, bhs, sizeof(bhs));
1302 bh_lru_unlock();
1304 if (evictee)
1305 __brelse(evictee);
1309 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1311 static struct buffer_head *
1312 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1314 struct buffer_head *ret = NULL;
1315 struct bh_lru *lru;
1316 unsigned int i;
1318 check_irqs_on();
1319 bh_lru_lock();
1320 lru = &__get_cpu_var(bh_lrus);
1321 for (i = 0; i < BH_LRU_SIZE; i++) {
1322 struct buffer_head *bh = lru->bhs[i];
1324 if (bh && bh->b_bdev == bdev &&
1325 bh->b_blocknr == block && bh->b_size == size) {
1326 if (i) {
1327 while (i) {
1328 lru->bhs[i] = lru->bhs[i - 1];
1329 i--;
1331 lru->bhs[0] = bh;
1333 get_bh(bh);
1334 ret = bh;
1335 break;
1338 bh_lru_unlock();
1339 return ret;
1343 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1344 * it in the LRU and mark it as accessed. If it is not present then return
1345 * NULL
1347 struct buffer_head *
1348 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1350 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1352 if (bh == NULL) {
1353 bh = __find_get_block_slow(bdev, block);
1354 if (bh)
1355 bh_lru_install(bh);
1357 if (bh)
1358 touch_buffer(bh);
1359 return bh;
1361 EXPORT_SYMBOL(__find_get_block);
1364 * __getblk will locate (and, if necessary, create) the buffer_head
1365 * which corresponds to the passed block_device, block and size. The
1366 * returned buffer has its reference count incremented.
1368 * __getblk() cannot fail - it just keeps trying. If you pass it an
1369 * illegal block number, __getblk() will happily return a buffer_head
1370 * which represents the non-existent block. Very weird.
1372 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1373 * attempt is failing. FIXME, perhaps?
1375 struct buffer_head *
1376 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1378 struct buffer_head *bh = __find_get_block(bdev, block, size);
1380 might_sleep();
1381 if (bh == NULL)
1382 bh = __getblk_slow(bdev, block, size);
1383 return bh;
1385 EXPORT_SYMBOL(__getblk);
1388 * Do async read-ahead on a buffer..
1390 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1392 struct buffer_head *bh = __getblk(bdev, block, size);
1393 if (likely(bh)) {
1394 ll_rw_block(READA, 1, &bh);
1395 brelse(bh);
1398 EXPORT_SYMBOL(__breadahead);
1401 * __bread() - reads a specified block and returns the bh
1402 * @bdev: the block_device to read from
1403 * @block: number of block
1404 * @size: size (in bytes) to read
1406 * Reads a specified block, and returns buffer head that contains it.
1407 * It returns NULL if the block was unreadable.
1409 struct buffer_head *
1410 __bread(struct block_device *bdev, sector_t block, unsigned size)
1412 struct buffer_head *bh = __getblk(bdev, block, size);
1414 if (likely(bh) && !buffer_uptodate(bh))
1415 bh = __bread_slow(bh);
1416 return bh;
1418 EXPORT_SYMBOL(__bread);
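/*
 * Illustrative sketch (editor's addition): the common read path for
 * filesystem metadata.  Most filesystems go through the sb_bread() wrapper,
 * which supplies the superblock's block size; the raw form is shown here.
 */
static int __maybe_unused example_read_block(struct block_device *bdev,
					     sector_t block, unsigned size)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)
		return -EIO;		/* the block was unreadable */
	/* ... parse bh->b_data ... */
	brelse(bh);
	return 0;
}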
1421 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1422 * This doesn't race because it runs in each cpu either in irq
1423 * or with preempt disabled.
1425 static void invalidate_bh_lru(void *arg)
1427 struct bh_lru *b = &get_cpu_var(bh_lrus);
1428 int i;
1430 for (i = 0; i < BH_LRU_SIZE; i++) {
1431 brelse(b->bhs[i]);
1432 b->bhs[i] = NULL;
1434 put_cpu_var(bh_lrus);
1437 void invalidate_bh_lrus(void)
1439 on_each_cpu(invalidate_bh_lru, NULL, 1);
1441 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1443 void set_bh_page(struct buffer_head *bh,
1444 struct page *page, unsigned long offset)
1446 bh->b_page = page;
1447 BUG_ON(offset >= PAGE_SIZE);
1448 if (PageHighMem(page))
1450 * This catches illegal uses and preserves the offset:
1452 bh->b_data = (char *)(0 + offset);
1453 else
1454 bh->b_data = page_address(page) + offset;
1456 EXPORT_SYMBOL(set_bh_page);
1459 * Called when truncating a buffer on a page completely.
1461 static void discard_buffer(struct buffer_head * bh)
1463 lock_buffer(bh);
1464 clear_buffer_dirty(bh);
1465 bh->b_bdev = NULL;
1466 clear_buffer_mapped(bh);
1467 clear_buffer_req(bh);
1468 clear_buffer_new(bh);
1469 clear_buffer_delay(bh);
1470 clear_buffer_unwritten(bh);
1471 unlock_buffer(bh);
1475 * block_invalidatepage - invalidate part or all of a buffer-backed page
1477 * @page: the page which is affected
1478 * @offset: the index of the truncation point
1480 * block_invalidatepage() is called when all or part of the page has become
1481 * invalidated by a truncate operation.
1483 * block_invalidatepage() does not have to release all buffers, but it must
1484 * ensure that no dirty buffer is left outside @offset and that no I/O
1485 * is underway against any of the blocks which are outside the truncation
1486 * point. Because the caller is about to free (and possibly reuse) those
1487 * blocks on-disk.
1489 void block_invalidatepage(struct page *page, unsigned long offset)
1491 struct buffer_head *head, *bh, *next;
1492 unsigned int curr_off = 0;
1494 BUG_ON(!PageLocked(page));
1495 if (!page_has_buffers(page))
1496 goto out;
1498 head = page_buffers(page);
1499 bh = head;
1500 do {
1501 unsigned int next_off = curr_off + bh->b_size;
1502 next = bh->b_this_page;
1505 * is this block fully invalidated?
1507 if (offset <= curr_off)
1508 discard_buffer(bh);
1509 curr_off = next_off;
1510 bh = next;
1511 } while (bh != head);
1514 * We release buffers only if the entire page is being invalidated.
1515 * The get_block cached value has been unconditionally invalidated,
1516 * so real IO is not possible anymore.
1518 if (offset == 0)
1519 try_to_release_page(page, 0);
1520 out:
1521 return;
1523 EXPORT_SYMBOL(block_invalidatepage);
1526 * We attach and possibly dirty the buffers atomically wrt
1527 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1528 * is already excluded via the page lock.
1530 void create_empty_buffers(struct page *page,
1531 unsigned long blocksize, unsigned long b_state)
1533 struct buffer_head *bh, *head, *tail;
1535 head = alloc_page_buffers(page, blocksize, 1);
1536 bh = head;
1537 do {
1538 bh->b_state |= b_state;
1539 tail = bh;
1540 bh = bh->b_this_page;
1541 } while (bh);
1542 tail->b_this_page = head;
1544 spin_lock(&page->mapping->private_lock);
1545 if (PageUptodate(page) || PageDirty(page)) {
1546 bh = head;
1547 do {
1548 if (PageDirty(page))
1549 set_buffer_dirty(bh);
1550 if (PageUptodate(page))
1551 set_buffer_uptodate(bh);
1552 bh = bh->b_this_page;
1553 } while (bh != head);
1555 attach_page_buffers(page, head);
1556 spin_unlock(&page->mapping->private_lock);
1558 EXPORT_SYMBOL(create_empty_buffers);
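/*
 * Illustrative sketch (editor's addition): how ->readpage/->writepage
 * implementations attach buffers before walking them, mirroring what
 * __block_write_full_page() does further down in this file.  The page is
 * assumed to be locked and to have a valid ->mapping.
 */
static void __maybe_unused example_attach_buffers(struct page *page,
						  struct inode *inode)
{
	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
	/* page_buffers(page) can now be walked via bh->b_this_page */
}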
1561 * We are taking a block for data and we don't want any output from any
1562 * buffer-cache aliases starting from return from that function and
1563 * until the moment when something will explicitly mark the buffer
1564 * dirty (hopefully that will not happen until we free that block ;-)
1565 * We don't even need to mark it not-uptodate - nobody can expect
1566 * anything from a newly allocated buffer anyway. We used to use
1567 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1568 * don't want to mark the alias unmapped, for example - it would confuse
1569 * anyone who might pick it with bread() afterwards...
1571 * Also.. Note that bforget() doesn't lock the buffer. So there can
1572 * be writeout I/O going on against recently-freed buffers. We don't
1573 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1574 * only if we really need to. That happens here.
1576 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1578 struct buffer_head *old_bh;
1580 might_sleep();
1582 old_bh = __find_get_block_slow(bdev, block);
1583 if (old_bh) {
1584 clear_buffer_dirty(old_bh);
1585 wait_on_buffer(old_bh);
1586 clear_buffer_req(old_bh);
1587 __brelse(old_bh);
1590 EXPORT_SYMBOL(unmap_underlying_metadata);
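/*
 * Illustrative sketch (editor's addition): the caller-side pattern.  When a
 * get_block() implementation allocates a brand-new block (buffer_new()),
 * generic code such as __block_prepare_write() below kills any stale
 * buffer-cache alias for it like this.
 */
static void __maybe_unused example_handle_new_block(struct buffer_head *bh)
{
	if (buffer_new(bh)) {
		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
		clear_buffer_new(bh);
	}
}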
1593 * NOTE! All mapped/uptodate combinations are valid:
1595 * Mapped Uptodate Meaning
1597 * No No "unknown" - must do get_block()
1598 * No Yes "hole" - zero-filled
1599 * Yes No "allocated" - allocated on disk, not read in
1600 * Yes Yes "valid" - allocated and up-to-date in memory.
1602 * "Dirty" is valid only with the last case (mapped+uptodate).
1606 * While block_write_full_page is writing back the dirty buffers under
1607 * the page lock, whoever dirtied the buffers may decide to clean them
1608 * again at any time. We handle that by only looking at the buffer
1609 * state inside lock_buffer().
1611 * If block_write_full_page() is called for regular writeback
1612 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1613 * locked buffer. This only can happen if someone has written the buffer
1614 * directly, with submit_bh(). At the address_space level PageWriteback
1615 * prevents this contention from occurring.
1617 * If block_write_full_page() is called with wbc->sync_mode ==
1618 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1619 * causes the writes to be flagged as synchronous writes, but the
1620 * block device queue will NOT be unplugged, since usually many pages
1621 * will be pushed out before the higher-level caller actually
1622 * waits for the writes to be completed. The various wait functions,
1623 * such as wait_on_writeback_range() will ultimately call sync_page()
1624 * which will ultimately call blk_run_backing_dev(), which will end up
1625 * unplugging the device queue.
1627 static int __block_write_full_page(struct inode *inode, struct page *page,
1628 get_block_t *get_block, struct writeback_control *wbc,
1629 bh_end_io_t *handler)
1631 int err;
1632 sector_t block;
1633 sector_t last_block;
1634 struct buffer_head *bh, *head;
1635 const unsigned blocksize = 1 << inode->i_blkbits;
1636 int nr_underway = 0;
1637 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1638 WRITE_SYNC_PLUG : WRITE);
1640 BUG_ON(!PageLocked(page));
1642 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1644 if (!page_has_buffers(page)) {
1645 create_empty_buffers(page, blocksize,
1646 (1 << BH_Dirty)|(1 << BH_Uptodate));
1650 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1651 * here, and the (potentially unmapped) buffers may become dirty at
1652 * any time. If a buffer becomes dirty here after we've inspected it
1653 * then we just miss that fact, and the page stays dirty.
1655 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1656 * handle that here by just cleaning them.
1659 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1660 head = page_buffers(page);
1661 bh = head;
1664 * Get all the dirty buffers mapped to disk addresses and
1665 * handle any aliases from the underlying blockdev's mapping.
1667 do {
1668 if (block > last_block) {
1670 * mapped buffers outside i_size will occur, because
1671 * this page can be outside i_size when there is a
1672 * truncate in progress.
1675 * The buffer was zeroed by block_write_full_page()
1677 clear_buffer_dirty(bh);
1678 set_buffer_uptodate(bh);
1679 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1680 buffer_dirty(bh)) {
1681 WARN_ON(bh->b_size != blocksize);
1682 err = get_block(inode, block, bh, 1);
1683 if (err)
1684 goto recover;
1685 clear_buffer_delay(bh);
1686 if (buffer_new(bh)) {
1687 /* blockdev mappings never come here */
1688 clear_buffer_new(bh);
1689 unmap_underlying_metadata(bh->b_bdev,
1690 bh->b_blocknr);
1693 bh = bh->b_this_page;
1694 block++;
1695 } while (bh != head);
1697 do {
1698 if (!buffer_mapped(bh))
1699 continue;
1701 * If it's a fully non-blocking write attempt and we cannot
1702 * lock the buffer then redirty the page. Note that this can
1703 * potentially cause a busy-wait loop from writeback threads
1704 * and kswapd activity, but those code paths have their own
1705 * higher-level throttling.
1707 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1708 lock_buffer(bh);
1709 } else if (!trylock_buffer(bh)) {
1710 redirty_page_for_writepage(wbc, page);
1711 continue;
1713 if (test_clear_buffer_dirty(bh)) {
1714 mark_buffer_async_write_endio(bh, handler);
1715 } else {
1716 unlock_buffer(bh);
1718 } while ((bh = bh->b_this_page) != head);
1721 * The page and its buffers are protected by PageWriteback(), so we can
1722 * drop the bh refcounts early.
1724 BUG_ON(PageWriteback(page));
1725 set_page_writeback(page);
1727 do {
1728 struct buffer_head *next = bh->b_this_page;
1729 if (buffer_async_write(bh)) {
1730 submit_bh(write_op, bh);
1731 nr_underway++;
1733 bh = next;
1734 } while (bh != head);
1735 unlock_page(page);
1737 err = 0;
1738 done:
1739 if (nr_underway == 0) {
1741 * The page was marked dirty, but the buffers were
1742 * clean. Someone wrote them back by hand with
1743 * ll_rw_block/submit_bh. A rare case.
1745 end_page_writeback(page);
1748 * The page and buffer_heads can be released at any time from
1749 * here on.
1752 return err;
1754 recover:
1756 * ENOSPC, or some other error. We may already have added some
1757 * blocks to the file, so we need to write these out to avoid
1758 * exposing stale data.
1759 * The page is currently locked and not marked for writeback
1761 bh = head;
1762 /* Recovery: lock and submit the mapped buffers */
1763 do {
1764 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1765 !buffer_delay(bh)) {
1766 lock_buffer(bh);
1767 mark_buffer_async_write_endio(bh, handler);
1768 } else {
1770 * The buffer may have been set dirty during
1771 * attachment to a dirty page.
1773 clear_buffer_dirty(bh);
1775 } while ((bh = bh->b_this_page) != head);
1776 SetPageError(page);
1777 BUG_ON(PageWriteback(page));
1778 mapping_set_error(page->mapping, err);
1779 set_page_writeback(page);
1780 do {
1781 struct buffer_head *next = bh->b_this_page;
1782 if (buffer_async_write(bh)) {
1783 clear_buffer_dirty(bh);
1784 submit_bh(write_op, bh);
1785 nr_underway++;
1787 bh = next;
1788 } while (bh != head);
1789 unlock_page(page);
1790 goto done;
1794 * If a page has any new buffers, zero them out here, and mark them uptodate
1795 * and dirty so they'll be written out (in order to prevent uninitialised
1796 * block data from leaking). And clear the new bit.
1798 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1800 unsigned int block_start, block_end;
1801 struct buffer_head *head, *bh;
1803 BUG_ON(!PageLocked(page));
1804 if (!page_has_buffers(page))
1805 return;
1807 bh = head = page_buffers(page);
1808 block_start = 0;
1809 do {
1810 block_end = block_start + bh->b_size;
1812 if (buffer_new(bh)) {
1813 if (block_end > from && block_start < to) {
1814 if (!PageUptodate(page)) {
1815 unsigned start, size;
1817 start = max(from, block_start);
1818 size = min(to, block_end) - start;
1820 zero_user(page, start, size);
1821 set_buffer_uptodate(bh);
1824 clear_buffer_new(bh);
1825 mark_buffer_dirty(bh);
1829 block_start = block_end;
1830 bh = bh->b_this_page;
1831 } while (bh != head);
1833 EXPORT_SYMBOL(page_zero_new_buffers);
1835 static int __block_prepare_write(struct inode *inode, struct page *page,
1836 unsigned from, unsigned to, get_block_t *get_block)
1838 unsigned block_start, block_end;
1839 sector_t block;
1840 int err = 0;
1841 unsigned blocksize, bbits;
1842 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1844 BUG_ON(!PageLocked(page));
1845 BUG_ON(from > PAGE_CACHE_SIZE);
1846 BUG_ON(to > PAGE_CACHE_SIZE);
1847 BUG_ON(from > to);
1849 blocksize = 1 << inode->i_blkbits;
1850 if (!page_has_buffers(page))
1851 create_empty_buffers(page, blocksize, 0);
1852 head = page_buffers(page);
1854 bbits = inode->i_blkbits;
1855 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1857 for(bh = head, block_start = 0; bh != head || !block_start;
1858 block++, block_start=block_end, bh = bh->b_this_page) {
1859 block_end = block_start + blocksize;
1860 if (block_end <= from || block_start >= to) {
1861 if (PageUptodate(page)) {
1862 if (!buffer_uptodate(bh))
1863 set_buffer_uptodate(bh);
1865 continue;
1867 if (buffer_new(bh))
1868 clear_buffer_new(bh);
1869 if (!buffer_mapped(bh)) {
1870 WARN_ON(bh->b_size != blocksize);
1871 err = get_block(inode, block, bh, 1);
1872 if (err)
1873 break;
1874 if (buffer_new(bh)) {
1875 unmap_underlying_metadata(bh->b_bdev,
1876 bh->b_blocknr);
1877 if (PageUptodate(page)) {
1878 clear_buffer_new(bh);
1879 set_buffer_uptodate(bh);
1880 mark_buffer_dirty(bh);
1881 continue;
1883 if (block_end > to || block_start < from)
1884 zero_user_segments(page,
1885 to, block_end,
1886 block_start, from);
1887 continue;
1890 if (PageUptodate(page)) {
1891 if (!buffer_uptodate(bh))
1892 set_buffer_uptodate(bh);
1893 continue;
1895 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1896 !buffer_unwritten(bh) &&
1897 (block_start < from || block_end > to)) {
1898 ll_rw_block(READ, 1, &bh);
1899 *wait_bh++=bh;
1903 * If we issued read requests - let them complete.
1905 while(wait_bh > wait) {
1906 wait_on_buffer(*--wait_bh);
1907 if (!buffer_uptodate(*wait_bh))
1908 err = -EIO;
1910 if (unlikely(err))
1911 page_zero_new_buffers(page, from, to);
1912 return err;
1915 static int __block_commit_write(struct inode *inode, struct page *page,
1916 unsigned from, unsigned to)
1918 unsigned block_start, block_end;
1919 int partial = 0;
1920 unsigned blocksize;
1921 struct buffer_head *bh, *head;
1923 blocksize = 1 << inode->i_blkbits;
1925 for(bh = head = page_buffers(page), block_start = 0;
1926 bh != head || !block_start;
1927 block_start=block_end, bh = bh->b_this_page) {
1928 block_end = block_start + blocksize;
1929 if (block_end <= from || block_start >= to) {
1930 if (!buffer_uptodate(bh))
1931 partial = 1;
1932 } else {
1933 set_buffer_uptodate(bh);
1934 mark_buffer_dirty(bh);
1936 clear_buffer_new(bh);
1940 * If this is a partial write which happened to make all buffers
1941 * uptodate then we can optimize away a bogus readpage() for
1942 * the next read(). Here we 'discover' whether the page went
1943 * uptodate as a result of this (potentially partial) write.
1945 if (!partial)
1946 SetPageUptodate(page);
1947 return 0;
1950 /*
1951 * block_write_begin takes care of the basic task of block allocation and
1952 * bringing partial write blocks uptodate first.
1953 *
1954 * If *pagep is not NULL, then block_write_begin uses the locked page
1955 * at *pagep rather than allocating its own. In this case, the page will
1956 * not be unlocked or deallocated on failure.
1957 */
1958 int block_write_begin(struct file *file, struct address_space *mapping,
1959 loff_t pos, unsigned len, unsigned flags,
1960 struct page **pagep, void **fsdata,
1961 get_block_t *get_block)
1962 {
1963 struct inode *inode = mapping->host;
1964 int status = 0;
1965 struct page *page;
1966 pgoff_t index;
1967 unsigned start, end;
1968 int ownpage = 0;
1970 index = pos >> PAGE_CACHE_SHIFT;
1971 start = pos & (PAGE_CACHE_SIZE - 1);
1972 end = start + len;
1974 page = *pagep;
1975 if (page == NULL) {
1976 ownpage = 1;
1977 page = grab_cache_page_write_begin(mapping, index, flags);
1978 if (!page) {
1979 status = -ENOMEM;
1980 goto out;
1981 }
1982 *pagep = page;
1983 } else
1984 BUG_ON(!PageLocked(page));
1986 status = __block_prepare_write(inode, page, start, end, get_block);
1987 if (unlikely(status)) {
1988 ClearPageUptodate(page);
1990 if (ownpage) {
1991 unlock_page(page);
1992 page_cache_release(page);
1993 *pagep = NULL;
1995 /*
1996 * prepare_write() may have instantiated a few blocks
1997 * outside i_size. Trim these off again. Don't need
1998 * i_size_read because we hold i_mutex.
1999 */
2000 if (pos + len > inode->i_size)
2001 vmtruncate(inode, inode->i_size);
2002 }
2003 }
2005 out:
2006 return status;
2007 }
2008 EXPORT_SYMBOL(block_write_begin);
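For illustration, a filesystem built on this helper usually implements its ->write_begin as a thin wrapper that supplies its own get_block callback and lets block_write_begin allocate and lock the page itself. A minimal sketch follows; the myfs_* names and myfs_get_block are hypothetical, and the fragment assumes <linux/fs.h> and <linux/buffer_head.h>.

/* Sketch only: myfs_get_block and myfs_write_begin are hypothetical. */
static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	*pagep = NULL;	/* let block_write_begin grab and lock the page */
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, myfs_get_block);
}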
2010 int block_write_end(struct file *file, struct address_space *mapping,
2011 loff_t pos, unsigned len, unsigned copied,
2012 struct page *page, void *fsdata)
2014 struct inode *inode = mapping->host;
2015 unsigned start;
2017 start = pos & (PAGE_CACHE_SIZE - 1);
2019 if (unlikely(copied < len)) {
2021 * The buffers that were written will now be uptodate, so we
2022 * don't have to worry about a readpage reading them and
2023 * overwriting a partial write. However if we have encountered
2024 * a short write and only partially written into a buffer, it
2025 * will not be marked uptodate, so a readpage might come in and
2026 * destroy our partial write.
2028 * Do the simplest thing, and just treat any short write to a
2029 * non uptodate page as a zero-length write, and force the
2030 * caller to redo the whole thing.
2032 if (!PageUptodate(page))
2033 copied = 0;
2035 page_zero_new_buffers(page, start+copied, start+len);
2037 flush_dcache_page(page);
2039 /* This could be a short (even 0-length) commit */
2040 __block_commit_write(inode, page, start, start+copied);
2042 return copied;
2044 EXPORT_SYMBOL(block_write_end);
2046 int generic_write_end(struct file *file, struct address_space *mapping,
2047 loff_t pos, unsigned len, unsigned copied,
2048 struct page *page, void *fsdata)
2050 struct inode *inode = mapping->host;
2051 int i_size_changed = 0;
2053 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2056 * No need to use i_size_read() here, the i_size
2057 * cannot change under us because we hold i_mutex.
2059 * But it's important to update i_size while still holding page lock:
2060 * page writeout could otherwise come in and zero beyond i_size.
2062 if (pos+copied > inode->i_size) {
2063 i_size_write(inode, pos+copied);
2064 i_size_changed = 1;
2067 unlock_page(page);
2068 page_cache_release(page);
2071 * Don't mark the inode dirty under page lock. First, it unnecessarily
2072 * makes the holding time of page lock longer. Second, it forces lock
2073 * ordering of page lock and transaction start for journaling
2074 * filesystems.
2076 if (i_size_changed)
2077 mark_inode_dirty(inode);
2079 return copied;
2081 EXPORT_SYMBOL(generic_write_end);
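Taken together with block_write_begin above, generic_write_end is normally wired straight into a filesystem's address_space_operations; only the write-path fields are shown in this sketch, and myfs_write_begin is the hypothetical wrapper from the earlier fragment.

/* Sketch only: partial address_space_operations for a hypothetical myfs. */
static const struct address_space_operations myfs_aops = {
	.write_begin	= myfs_write_begin,	/* wrapper around block_write_begin */
	.write_end	= generic_write_end,	/* commit buffers and update i_size */
};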
2083 /*
2084 * block_is_partially_uptodate checks whether buffers within a page are
2085 * uptodate or not.
2087 * Returns true if all buffers which correspond to a file portion
2088 * we want to read are uptodate.
2089 */
2090 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2091 unsigned long from)
2093 struct inode *inode = page->mapping->host;
2094 unsigned block_start, block_end, blocksize;
2095 unsigned to;
2096 struct buffer_head *bh, *head;
2097 int ret = 1;
2099 if (!page_has_buffers(page))
2100 return 0;
2102 blocksize = 1 << inode->i_blkbits;
2103 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2104 to = from + to;
2105 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2106 return 0;
2108 head = page_buffers(page);
2109 bh = head;
2110 block_start = 0;
2111 do {
2112 block_end = block_start + blocksize;
2113 if (block_end > from && block_start < to) {
2114 if (!buffer_uptodate(bh)) {
2115 ret = 0;
2116 break;
2118 if (block_end >= to)
2119 break;
2121 block_start = block_end;
2122 bh = bh->b_this_page;
2123 } while (bh != head);
2125 return ret;
2127 EXPORT_SYMBOL(block_is_partially_uptodate);
2129 /*
2130 * Generic "read page" function for block devices that have the normal
2131 * get_block functionality. This is most of the block device filesystems.
2132 * Reads the page asynchronously --- the unlock_buffer() and
2133 * set/clear_buffer_uptodate() functions propagate buffer state into the
2134 * page struct once IO has completed.
2135 */
2136 int block_read_full_page(struct page *page, get_block_t *get_block)
2138 struct inode *inode = page->mapping->host;
2139 sector_t iblock, lblock;
2140 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2141 unsigned int blocksize;
2142 int nr, i;
2143 int fully_mapped = 1;
2145 BUG_ON(!PageLocked(page));
2146 blocksize = 1 << inode->i_blkbits;
2147 if (!page_has_buffers(page))
2148 create_empty_buffers(page, blocksize, 0);
2149 head = page_buffers(page);
2151 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2152 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2153 bh = head;
2154 nr = 0;
2155 i = 0;
2157 do {
2158 if (buffer_uptodate(bh))
2159 continue;
2161 if (!buffer_mapped(bh)) {
2162 int err = 0;
2164 fully_mapped = 0;
2165 if (iblock < lblock) {
2166 WARN_ON(bh->b_size != blocksize);
2167 err = get_block(inode, iblock, bh, 0);
2168 if (err)
2169 SetPageError(page);
2171 if (!buffer_mapped(bh)) {
2172 zero_user(page, i * blocksize, blocksize);
2173 if (!err)
2174 set_buffer_uptodate(bh);
2175 continue;
2178 * get_block() might have updated the buffer
2179 * synchronously
2181 if (buffer_uptodate(bh))
2182 continue;
2184 arr[nr++] = bh;
2185 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2187 if (fully_mapped)
2188 SetPageMappedToDisk(page);
2190 if (!nr) {
2192 * All buffers are uptodate - we can set the page uptodate
2193 * as well. But not if get_block() returned an error.
2195 if (!PageError(page))
2196 SetPageUptodate(page);
2197 unlock_page(page);
2198 return 0;
2201 /* Stage two: lock the buffers */
2202 for (i = 0; i < nr; i++) {
2203 bh = arr[i];
2204 lock_buffer(bh);
2205 mark_buffer_async_read(bh);
2209 * Stage 3: start the IO. Check for uptodateness
2210 * inside the buffer lock in case another process reading
2211 * the underlying blockdev brought it uptodate (the sct fix).
2213 for (i = 0; i < nr; i++) {
2214 bh = arr[i];
2215 if (buffer_uptodate(bh))
2216 end_buffer_async_read(bh, 1);
2217 else
2218 submit_bh(READ, bh);
2220 return 0;
2222 EXPORT_SYMBOL(block_read_full_page);
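A filesystem with a working get_block callback typically exposes this helper directly through its ->readpage operation, as in the hypothetical sketch below.

/* Sketch only: myfs_readpage and myfs_get_block are hypothetical. */
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}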
2224 /* utility function for filesystems that need to do work on expanding
2225 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2226 * deal with the hole.
2227 */
2228 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2230 struct address_space *mapping = inode->i_mapping;
2231 struct page *page;
2232 void *fsdata;
2233 int err;
2235 err = inode_newsize_ok(inode, size);
2236 if (err)
2237 goto out;
2239 err = pagecache_write_begin(NULL, mapping, size, 0,
2240 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2241 &page, &fsdata);
2242 if (err)
2243 goto out;
2245 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2246 BUG_ON(err > 0);
2248 out:
2249 return err;
2251 EXPORT_SYMBOL(generic_cont_expand_simple);
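A typical caller is a ->setattr implementation that has to zero-fill the gap when a file is being extended; a hedged sketch follows, where myfs_setattr is hypothetical and the handling of the remaining attributes is elided.

/* Sketch only: expanding truncate in a hypothetical myfs_setattr. */
static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int err;

	/* Expanding the file: zero-fill between the old and the new EOF */
	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > i_size_read(inode)) {
		err = generic_cont_expand_simple(inode, attr->ia_size);
		if (err)
			return err;
	}
	/* ... the filesystem would apply the remaining attributes here ... */
	return 0;
}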
2253 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2254 loff_t pos, loff_t *bytes)
2256 struct inode *inode = mapping->host;
2257 unsigned blocksize = 1 << inode->i_blkbits;
2258 struct page *page;
2259 void *fsdata;
2260 pgoff_t index, curidx;
2261 loff_t curpos;
2262 unsigned zerofrom, offset, len;
2263 int err = 0;
2265 index = pos >> PAGE_CACHE_SHIFT;
2266 offset = pos & ~PAGE_CACHE_MASK;
2268 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2269 zerofrom = curpos & ~PAGE_CACHE_MASK;
2270 if (zerofrom & (blocksize-1)) {
2271 *bytes |= (blocksize-1);
2272 (*bytes)++;
2274 len = PAGE_CACHE_SIZE - zerofrom;
2276 err = pagecache_write_begin(file, mapping, curpos, len,
2277 AOP_FLAG_UNINTERRUPTIBLE,
2278 &page, &fsdata);
2279 if (err)
2280 goto out;
2281 zero_user(page, zerofrom, len);
2282 err = pagecache_write_end(file, mapping, curpos, len, len,
2283 page, fsdata);
2284 if (err < 0)
2285 goto out;
2286 BUG_ON(err != len);
2287 err = 0;
2289 balance_dirty_pages_ratelimited(mapping);
2292 /* page covers the boundary, find the boundary offset */
2293 if (index == curidx) {
2294 zerofrom = curpos & ~PAGE_CACHE_MASK;
2295 /* if we are expanding the file, the last block will be filled */
2296 if (offset <= zerofrom) {
2297 goto out;
2299 if (zerofrom & (blocksize-1)) {
2300 *bytes |= (blocksize-1);
2301 (*bytes)++;
2303 len = offset - zerofrom;
2305 err = pagecache_write_begin(file, mapping, curpos, len,
2306 AOP_FLAG_UNINTERRUPTIBLE,
2307 &page, &fsdata);
2308 if (err)
2309 goto out;
2310 zero_user(page, zerofrom, len);
2311 err = pagecache_write_end(file, mapping, curpos, len, len,
2312 page, fsdata);
2313 if (err < 0)
2314 goto out;
2315 BUG_ON(err != len);
2316 err = 0;
2318 out:
2319 return err;
2320 }
2322 /*
2323 * For moronic filesystems that do not allow holes in files.
2324 * We may have to extend the file.
2325 */
2326 int cont_write_begin(struct file *file, struct address_space *mapping,
2327 loff_t pos, unsigned len, unsigned flags,
2328 struct page **pagep, void **fsdata,
2329 get_block_t *get_block, loff_t *bytes)
2331 struct inode *inode = mapping->host;
2332 unsigned blocksize = 1 << inode->i_blkbits;
2333 unsigned zerofrom;
2334 int err;
2336 err = cont_expand_zero(file, mapping, pos, bytes);
2337 if (err)
2338 goto out;
2340 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2341 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2342 *bytes |= (blocksize-1);
2343 (*bytes)++;
2346 *pagep = NULL;
2347 err = block_write_begin(file, mapping, pos, len,
2348 flags, pagep, fsdata, get_block);
2349 out:
2350 return err;
2352 EXPORT_SYMBOL(cont_write_begin);
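A hole-less filesystem would use cont_write_begin from its ->write_begin, passing a pointer to its per-inode "initialized up to" offset. In the sketch below, MYFS_I() and its mmu_private field are hypothetical stand-ins for such a field.

/* Sketch only: ->write_begin for a filesystem that cannot represent holes. */
static int myfs_cont_write_begin(struct file *file, struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	*pagep = NULL;
	return cont_write_begin(file, mapping, pos, len, flags,
				pagep, fsdata, myfs_get_block,
				&MYFS_I(mapping->host)->mmu_private);
}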
2354 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2355 get_block_t *get_block)
2357 struct inode *inode = page->mapping->host;
2358 int err = __block_prepare_write(inode, page, from, to, get_block);
2359 if (err)
2360 ClearPageUptodate(page);
2361 return err;
2363 EXPORT_SYMBOL(block_prepare_write);
2365 int block_commit_write(struct page *page, unsigned from, unsigned to)
2367 struct inode *inode = page->mapping->host;
2368 __block_commit_write(inode,page,from,to);
2369 return 0;
2371 EXPORT_SYMBOL(block_commit_write);
2373 /*
2374 * block_page_mkwrite() is not allowed to change the file size as it gets
2375 * called from a page fault handler when a page is first dirtied. Hence we must
2376 * be careful to check for EOF conditions here. We set the page up correctly
2377 * for a written page which means we get ENOSPC checking when writing into
2378 * holes and correct delalloc and unwritten extent mapping on filesystems that
2379 * support these features.
2381 * We are not allowed to take the i_mutex here so we have to play games to
2382 * protect against truncate races as the page could now be beyond EOF. Because
2383 * vmtruncate() writes the inode size before removing pages, once we have the
2384 * page lock we can determine safely if the page is beyond EOF. If it is not
2385 * beyond EOF, then the page is guaranteed safe against truncation until we
2386 * unlock the page.
2387 */
2388 int
2389 block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2390 get_block_t get_block)
2392 struct page *page = vmf->page;
2393 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2394 unsigned long end;
2395 loff_t size;
2396 int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2398 lock_page(page);
2399 size = i_size_read(inode);
2400 if ((page->mapping != inode->i_mapping) ||
2401 (page_offset(page) > size)) {
2402 /* page got truncated out from underneath us */
2403 unlock_page(page);
2404 goto out;
2407 /* page is wholly or partially inside EOF */
2408 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2409 end = size & ~PAGE_CACHE_MASK;
2410 else
2411 end = PAGE_CACHE_SIZE;
2413 ret = block_prepare_write(page, 0, end, get_block);
2414 if (!ret)
2415 ret = block_commit_write(page, 0, end);
2417 if (unlikely(ret)) {
2418 unlock_page(page);
2419 if (ret == -ENOMEM)
2420 ret = VM_FAULT_OOM;
2421 else /* -ENOSPC, -EIO, etc */
2422 ret = VM_FAULT_SIGBUS;
2423 } else
2424 ret = VM_FAULT_LOCKED;
2426 out:
2427 return ret;
2429 EXPORT_SYMBOL(block_page_mkwrite);
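Filesystems usually expose this through the ->page_mkwrite member of their file vm_operations_struct, with read faults still served by filemap_fault; the myfs_* names in this sketch are hypothetical.

/* Sketch only: hooking block_page_mkwrite into a hypothetical myfs mmap path. */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return block_page_mkwrite(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,	/* read faults served from the page cache */
	.page_mkwrite	= myfs_page_mkwrite,	/* write faults allocate blocks, dirty the page */
};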
2431 /*
2432 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2433 * immediately, while under the page lock. So it needs a special end_io
2434 * handler which does not touch the bh after unlocking it.
2435 */
2436 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2438 __end_buffer_read_notouch(bh, uptodate);
2442 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2443 * the page (converting it to circular linked list and taking care of page
2444 * dirty races).
2446 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2448 struct buffer_head *bh;
2450 BUG_ON(!PageLocked(page));
2452 spin_lock(&page->mapping->private_lock);
2453 bh = head;
2454 do {
2455 if (PageDirty(page))
2456 set_buffer_dirty(bh);
2457 if (!bh->b_this_page)
2458 bh->b_this_page = head;
2459 bh = bh->b_this_page;
2460 } while (bh != head);
2461 attach_page_buffers(page, head);
2462 spin_unlock(&page->mapping->private_lock);
2466 * On entry, the page is fully not uptodate.
2467 * On exit the page is fully uptodate in the areas outside (from,to)
2469 int nobh_write_begin(struct file *file, struct address_space *mapping,
2470 loff_t pos, unsigned len, unsigned flags,
2471 struct page **pagep, void **fsdata,
2472 get_block_t *get_block)
2474 struct inode *inode = mapping->host;
2475 const unsigned blkbits = inode->i_blkbits;
2476 const unsigned blocksize = 1 << blkbits;
2477 struct buffer_head *head, *bh;
2478 struct page *page;
2479 pgoff_t index;
2480 unsigned from, to;
2481 unsigned block_in_page;
2482 unsigned block_start, block_end;
2483 sector_t block_in_file;
2484 int nr_reads = 0;
2485 int ret = 0;
2486 int is_mapped_to_disk = 1;
2488 index = pos >> PAGE_CACHE_SHIFT;
2489 from = pos & (PAGE_CACHE_SIZE - 1);
2490 to = from + len;
2492 page = grab_cache_page_write_begin(mapping, index, flags);
2493 if (!page)
2494 return -ENOMEM;
2495 *pagep = page;
2496 *fsdata = NULL;
2498 if (page_has_buffers(page)) {
2499 unlock_page(page);
2500 page_cache_release(page);
2501 *pagep = NULL;
2502 return block_write_begin(file, mapping, pos, len, flags, pagep,
2503 fsdata, get_block);
2506 if (PageMappedToDisk(page))
2507 return 0;
2510 * Allocate buffers so that we can keep track of state, and potentially
2511 * attach them to the page if an error occurs. In the common case of
2512 * no error, they will just be freed again without ever being attached
2513 * to the page (which is all OK, because we're under the page lock).
2515 * Be careful: the buffer linked list is a NULL terminated one, rather
2516 * than the circular one we're used to.
2518 head = alloc_page_buffers(page, blocksize, 0);
2519 if (!head) {
2520 ret = -ENOMEM;
2521 goto out_release;
2524 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2527 * We loop across all blocks in the page, whether or not they are
2528 * part of the affected region. This is so we can discover if the
2529 * page is fully mapped-to-disk.
2531 for (block_start = 0, block_in_page = 0, bh = head;
2532 block_start < PAGE_CACHE_SIZE;
2533 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2534 int create;
2536 block_end = block_start + blocksize;
2537 bh->b_state = 0;
2538 create = 1;
2539 if (block_start >= to)
2540 create = 0;
2541 ret = get_block(inode, block_in_file + block_in_page,
2542 bh, create);
2543 if (ret)
2544 goto failed;
2545 if (!buffer_mapped(bh))
2546 is_mapped_to_disk = 0;
2547 if (buffer_new(bh))
2548 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2549 if (PageUptodate(page)) {
2550 set_buffer_uptodate(bh);
2551 continue;
2553 if (buffer_new(bh) || !buffer_mapped(bh)) {
2554 zero_user_segments(page, block_start, from,
2555 to, block_end);
2556 continue;
2558 if (buffer_uptodate(bh))
2559 continue; /* reiserfs does this */
2560 if (block_start < from || block_end > to) {
2561 lock_buffer(bh);
2562 bh->b_end_io = end_buffer_read_nobh;
2563 submit_bh(READ, bh);
2564 nr_reads++;
2568 if (nr_reads) {
2570 * The page is locked, so these buffers are protected from
2571 * any VM or truncate activity. Hence we don't need to care
2572 * for the buffer_head refcounts.
2574 for (bh = head; bh; bh = bh->b_this_page) {
2575 wait_on_buffer(bh);
2576 if (!buffer_uptodate(bh))
2577 ret = -EIO;
2579 if (ret)
2580 goto failed;
2583 if (is_mapped_to_disk)
2584 SetPageMappedToDisk(page);
2586 *fsdata = head; /* to be released by nobh_write_end */
2588 return 0;
2590 failed:
2591 BUG_ON(!ret);
2593 * Error recovery is a bit difficult. We need to zero out blocks that
2594 * were newly allocated, and dirty them to ensure they get written out.
2595 * Buffers need to be attached to the page at this point, otherwise
2596 * the handling of potential IO errors during writeout would be hard
2597 * (could try doing synchronous writeout, but what if that fails too?)
2599 attach_nobh_buffers(page, head);
2600 page_zero_new_buffers(page, from, to);
2602 out_release:
2603 unlock_page(page);
2604 page_cache_release(page);
2605 *pagep = NULL;
2607 if (pos + len > inode->i_size)
2608 vmtruncate(inode, inode->i_size);
2610 return ret;
2612 EXPORT_SYMBOL(nobh_write_begin);
2614 int nobh_write_end(struct file *file, struct address_space *mapping,
2615 loff_t pos, unsigned len, unsigned copied,
2616 struct page *page, void *fsdata)
2618 struct inode *inode = page->mapping->host;
2619 struct buffer_head *head = fsdata;
2620 struct buffer_head *bh;
2621 BUG_ON(fsdata != NULL && page_has_buffers(page));
2623 if (unlikely(copied < len) && head)
2624 attach_nobh_buffers(page, head);
2625 if (page_has_buffers(page))
2626 return generic_write_end(file, mapping, pos, len,
2627 copied, page, fsdata);
2629 SetPageUptodate(page);
2630 set_page_dirty(page);
2631 if (pos+copied > inode->i_size) {
2632 i_size_write(inode, pos+copied);
2633 mark_inode_dirty(inode);
2636 unlock_page(page);
2637 page_cache_release(page);
2639 while (head) {
2640 bh = head;
2641 head = head->b_this_page;
2642 free_buffer_head(bh);
2645 return copied;
2647 EXPORT_SYMBOL(nobh_write_end);
2649 /*
2650 * nobh_writepage() - based on block_write_full_page() except
2651 * that it tries to operate without attaching bufferheads to
2652 * the page.
2653 */
2654 int nobh_writepage(struct page *page, get_block_t *get_block,
2655 struct writeback_control *wbc)
2657 struct inode * const inode = page->mapping->host;
2658 loff_t i_size = i_size_read(inode);
2659 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2660 unsigned offset;
2661 int ret;
2663 /* Is the page fully inside i_size? */
2664 if (page->index < end_index)
2665 goto out;
2667 /* Is the page fully outside i_size? (truncate in progress) */
2668 offset = i_size & (PAGE_CACHE_SIZE-1);
2669 if (page->index >= end_index+1 || !offset) {
2671 * The page may have dirty, unmapped buffers. For example,
2672 * they may have been added in ext3_writepage(). Make them
2673 * freeable here, so the page does not leak.
2675 #if 0
2676 /* Not really sure about this - do we need this ? */
2677 if (page->mapping->a_ops->invalidatepage)
2678 page->mapping->a_ops->invalidatepage(page, offset);
2679 #endif
2680 unlock_page(page);
2681 return 0; /* don't care */
2685 * The page straddles i_size. It must be zeroed out on each and every
2686 * writepage invocation because it may be mmapped. "A file is mapped
2687 * in multiples of the page size. For a file that is not a multiple of
2688 * the page size, the remaining memory is zeroed when mapped, and
2689 * writes to that region are not written out to the file."
2691 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2692 out:
2693 ret = mpage_writepage(page, get_block, wbc);
2694 if (ret == -EAGAIN)
2695 ret = __block_write_full_page(inode, page, get_block, wbc,
2696 end_buffer_async_write);
2697 return ret;
2699 EXPORT_SYMBOL(nobh_writepage);
2701 int nobh_truncate_page(struct address_space *mapping,
2702 loff_t from, get_block_t *get_block)
2704 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2705 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2706 unsigned blocksize;
2707 sector_t iblock;
2708 unsigned length, pos;
2709 struct inode *inode = mapping->host;
2710 struct page *page;
2711 struct buffer_head map_bh;
2712 int err;
2714 blocksize = 1 << inode->i_blkbits;
2715 length = offset & (blocksize - 1);
2717 /* Block boundary? Nothing to do */
2718 if (!length)
2719 return 0;
2721 length = blocksize - length;
2722 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2724 page = grab_cache_page(mapping, index);
2725 err = -ENOMEM;
2726 if (!page)
2727 goto out;
2729 if (page_has_buffers(page)) {
2730 has_buffers:
2731 unlock_page(page);
2732 page_cache_release(page);
2733 return block_truncate_page(mapping, from, get_block);
2736 /* Find the buffer that contains "offset" */
2737 pos = blocksize;
2738 while (offset >= pos) {
2739 iblock++;
2740 pos += blocksize;
2743 map_bh.b_size = blocksize;
2744 map_bh.b_state = 0;
2745 err = get_block(inode, iblock, &map_bh, 0);
2746 if (err)
2747 goto unlock;
2748 /* unmapped? It's a hole - nothing to do */
2749 if (!buffer_mapped(&map_bh))
2750 goto unlock;
2752 /* Ok, it's mapped. Make sure it's up-to-date */
2753 if (!PageUptodate(page)) {
2754 err = mapping->a_ops->readpage(NULL, page);
2755 if (err) {
2756 page_cache_release(page);
2757 goto out;
2759 lock_page(page);
2760 if (!PageUptodate(page)) {
2761 err = -EIO;
2762 goto unlock;
2764 if (page_has_buffers(page))
2765 goto has_buffers;
2767 zero_user(page, offset, length);
2768 set_page_dirty(page);
2769 err = 0;
2771 unlock:
2772 unlock_page(page);
2773 page_cache_release(page);
2774 out:
2775 return err;
2777 EXPORT_SYMBOL(nobh_truncate_page);
2779 int block_truncate_page(struct address_space *mapping,
2780 loff_t from, get_block_t *get_block)
2782 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2783 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2784 unsigned blocksize;
2785 sector_t iblock;
2786 unsigned length, pos;
2787 struct inode *inode = mapping->host;
2788 struct page *page;
2789 struct buffer_head *bh;
2790 int err;
2792 blocksize = 1 << inode->i_blkbits;
2793 length = offset & (blocksize - 1);
2795 /* Block boundary? Nothing to do */
2796 if (!length)
2797 return 0;
2799 length = blocksize - length;
2800 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2802 page = grab_cache_page(mapping, index);
2803 err = -ENOMEM;
2804 if (!page)
2805 goto out;
2807 if (!page_has_buffers(page))
2808 create_empty_buffers(page, blocksize, 0);
2810 /* Find the buffer that contains "offset" */
2811 bh = page_buffers(page);
2812 pos = blocksize;
2813 while (offset >= pos) {
2814 bh = bh->b_this_page;
2815 iblock++;
2816 pos += blocksize;
2819 err = 0;
2820 if (!buffer_mapped(bh)) {
2821 WARN_ON(bh->b_size != blocksize);
2822 err = get_block(inode, iblock, bh, 0);
2823 if (err)
2824 goto unlock;
2825 /* unmapped? It's a hole - nothing to do */
2826 if (!buffer_mapped(bh))
2827 goto unlock;
2830 /* Ok, it's mapped. Make sure it's up-to-date */
2831 if (PageUptodate(page))
2832 set_buffer_uptodate(bh);
2834 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2835 err = -EIO;
2836 ll_rw_block(READ, 1, &bh);
2837 wait_on_buffer(bh);
2838 /* Uhhuh. Read error. Complain and punt. */
2839 if (!buffer_uptodate(bh))
2840 goto unlock;
2843 zero_user(page, offset, length);
2844 mark_buffer_dirty(bh);
2845 err = 0;
2847 unlock:
2848 unlock_page(page);
2849 page_cache_release(page);
2850 out:
2851 return err;
2853 EXPORT_SYMBOL(block_truncate_page);
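In a truncate path, this helper is the step that zeroes the tail of the block that now straddles i_size, before the filesystem frees the blocks beyond it; myfs_truncate and the freeing step in the sketch are hypothetical.

/* Sketch only: the tail-block zeroing step of a hypothetical myfs_truncate. */
static void myfs_truncate(struct inode *inode)
{
	/* zero the part of the last block that now lies beyond i_size
	 * (return value ignored here for brevity) */
	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);

	/* ... filesystem-specific freeing of blocks beyond i_size would follow ... */
	mark_inode_dirty(inode);
}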
2855 /*
2856 * The generic ->writepage function for buffer-backed address_spaces;
2857 * this form passes in the end_io handler used to finish the IO.
2858 */
2859 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2860 struct writeback_control *wbc, bh_end_io_t *handler)
2862 struct inode * const inode = page->mapping->host;
2863 loff_t i_size = i_size_read(inode);
2864 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2865 unsigned offset;
2867 /* Is the page fully inside i_size? */
2868 if (page->index < end_index)
2869 return __block_write_full_page(inode, page, get_block, wbc,
2870 handler);
2872 /* Is the page fully outside i_size? (truncate in progress) */
2873 offset = i_size & (PAGE_CACHE_SIZE-1);
2874 if (page->index >= end_index+1 || !offset) {
2876 * The page may have dirty, unmapped buffers. For example,
2877 * they may have been added in ext3_writepage(). Make them
2878 * freeable here, so the page does not leak.
2880 do_invalidatepage(page, 0);
2881 unlock_page(page);
2882 return 0; /* don't care */
2886 * The page straddles i_size. It must be zeroed out on each and every
2887 * writepage invocation because it may be mmapped. "A file is mapped
2888 * in multiples of the page size. For a file that is not a multiple of
2889 * the page size, the remaining memory is zeroed when mapped, and
2890 * writes to that region are not written out to the file."
2892 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2893 return __block_write_full_page(inode, page, get_block, wbc, handler);
2895 EXPORT_SYMBOL(block_write_full_page_endio);
2897 /*
2898 * The generic ->writepage function for buffer-backed address_spaces
2899 */
2900 int block_write_full_page(struct page *page, get_block_t *get_block,
2901 struct writeback_control *wbc)
2903 return block_write_full_page_endio(page, get_block, wbc,
2904 end_buffer_async_write);
2906 EXPORT_SYMBOL(block_write_full_page);
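As with ->readpage, a filesystem ordinarily wraps this directly as its ->writepage; myfs_writepage and myfs_get_block below are hypothetical.

/* Sketch only: a hypothetical myfs ->writepage built on block_write_full_page. */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}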
2908 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2909 get_block_t *get_block)
2911 struct buffer_head tmp;
2912 struct inode *inode = mapping->host;
2913 tmp.b_state = 0;
2914 tmp.b_blocknr = 0;
2915 tmp.b_size = 1 << inode->i_blkbits;
2916 get_block(inode, block, &tmp, 0);
2917 return tmp.b_blocknr;
2919 EXPORT_SYMBOL(generic_block_bmap);
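The usual caller is a filesystem's ->bmap method, which makes FIBMAP work with nothing more than the existing get_block callback; myfs_bmap in the sketch is hypothetical.

/* Sketch only: FIBMAP support via generic_block_bmap in a hypothetical myfs. */
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}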
2921 static void end_bio_bh_io_sync(struct bio *bio, int err)
2923 struct buffer_head *bh = bio->bi_private;
2925 if (err == -EOPNOTSUPP) {
2926 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2927 set_bit(BH_Eopnotsupp, &bh->b_state);
2930 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2931 set_bit(BH_Quiet, &bh->b_state);
2933 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2934 bio_put(bio);
2937 int submit_bh(int rw, struct buffer_head * bh)
2939 struct bio *bio;
2940 int ret = 0;
2942 BUG_ON(!buffer_locked(bh));
2943 BUG_ON(!buffer_mapped(bh));
2944 BUG_ON(!bh->b_end_io);
2945 BUG_ON(buffer_delay(bh));
2946 BUG_ON(buffer_unwritten(bh));
2948 /*
2949 * Mask in barrier bit for a write (could be either a WRITE or a
2950 * WRITE_SYNC)
2951 */
2952 if (buffer_ordered(bh) && (rw & WRITE))
2953 rw |= WRITE_BARRIER;
2956 * Only clear out a write error when rewriting
2958 if (test_set_buffer_req(bh) && (rw & WRITE))
2959 clear_buffer_write_io_error(bh);
2962 * from here on down, it's all bio -- do the initial mapping,
2963 * submit_bio -> generic_make_request may further map this bio around
2965 bio = bio_alloc(GFP_NOIO, 1);
2967 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2968 bio->bi_bdev = bh->b_bdev;
2969 bio->bi_io_vec[0].bv_page = bh->b_page;
2970 bio->bi_io_vec[0].bv_len = bh->b_size;
2971 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2973 bio->bi_vcnt = 1;
2974 bio->bi_idx = 0;
2975 bio->bi_size = bh->b_size;
2977 bio->bi_end_io = end_bio_bh_io_sync;
2978 bio->bi_private = bh;
2980 bio_get(bio);
2981 submit_bio(rw, bio);
2983 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2984 ret = -EOPNOTSUPP;
2986 bio_put(bio);
2987 return ret;
2989 EXPORT_SYMBOL(submit_bh);
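Callers that drive submit_bh directly follow the lock / set b_end_io / submit / wait pattern; a minimal synchronous read might look like the hypothetical helper below, which assumes the caller already holds a reference on the buffer_head.

/* Sketch only: synchronously reading one mapped buffer with submit_bh(). */
static int myfs_read_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		/* someone else brought it uptodate while we waited for the lock */
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);			/* reference dropped by end_buffer_read_sync() */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}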
2991 /**
2992 * ll_rw_block: low-level access to block devices (DEPRECATED)
2993 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2994 * @nr: number of &struct buffer_heads in the array
2995 * @bhs: array of pointers to &struct buffer_head
2997 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2998 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2999 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
3000 * are sent to disk. The fourth %READA option is described in the documentation
3001 * for generic_make_request() which ll_rw_block() calls.
3003 * This function drops any buffer that it cannot get a lock on (with the
3004 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
3005 * clean when doing a write request, and any buffer that appears to be
3006 * up-to-date when doing read request. Further it marks as clean buffers that
3007 * are processed for writing (the buffer cache won't assume that they are
3008 * actually clean until the buffer gets unlocked).
3010 * ll_rw_block sets b_end_io to simple completion handler that marks
3011 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3012 * any waiters.
3014 * All of the buffers must be for the same device, and must also be a
3015 * multiple of the current approved size for the device.
3016 */
3017 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3019 int i;
3021 for (i = 0; i < nr; i++) {
3022 struct buffer_head *bh = bhs[i];
3024 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
3025 lock_buffer(bh);
3026 else if (!trylock_buffer(bh))
3027 continue;
3029 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3030 rw == SWRITE_SYNC_PLUG) {
3031 if (test_clear_buffer_dirty(bh)) {
3032 bh->b_end_io = end_buffer_write_sync;
3033 get_bh(bh);
3034 if (rw == SWRITE_SYNC)
3035 submit_bh(WRITE_SYNC, bh);
3036 else
3037 submit_bh(WRITE, bh);
3038 continue;
3040 } else {
3041 if (!buffer_uptodate(bh)) {
3042 bh->b_end_io = end_buffer_read_sync;
3043 get_bh(bh);
3044 submit_bh(rw, bh);
3045 continue;
3048 unlock_buffer(bh);
3051 EXPORT_SYMBOL(ll_rw_block);
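Because ll_rw_block silently skips buffers it cannot lock or that already look uptodate, callers must re-check buffer_uptodate after waiting, as in this hypothetical sketch.

/* Sketch only: reading a batch of buffers with ll_rw_block() and waiting. */
static int myfs_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i;

	ll_rw_block(READ, nr, bhs);	/* uptodate or already-locked buffers are skipped */
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			return -EIO;
	}
	return 0;
}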
3053 /*
3054 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3055 * and then start new I/O and then wait upon it. The caller must have a ref on
3056 * the buffer_head.
3057 */
3058 int sync_dirty_buffer(struct buffer_head *bh)
3060 int ret = 0;
3062 WARN_ON(atomic_read(&bh->b_count) < 1);
3063 lock_buffer(bh);
3064 if (test_clear_buffer_dirty(bh)) {
3065 get_bh(bh);
3066 bh->b_end_io = end_buffer_write_sync;
3067 ret = submit_bh(WRITE_SYNC, bh);
3068 wait_on_buffer(bh);
3069 if (buffer_eopnotsupp(bh)) {
3070 clear_buffer_eopnotsupp(bh);
3071 ret = -EOPNOTSUPP;
3073 if (!ret && !buffer_uptodate(bh))
3074 ret = -EIO;
3075 } else {
3076 unlock_buffer(bh);
3078 return ret;
3080 EXPORT_SYMBOL(sync_dirty_buffer);
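A typical data-integrity caller reads a metadata block, modifies it, marks it dirty and then forces it out with sync_dirty_buffer. In the sketch below the myfs helper, block number and payload are illustrative only.

/* Sketch only: synchronously updating one metadata block. */
static int myfs_update_block(struct super_block *sb, sector_t blocknr,
			     const void *data, unsigned int len)
{
	struct buffer_head *bh;
	int err;

	bh = sb_bread(sb, blocknr);		/* read (or find) the block */
	if (!bh)
		return -EIO;
	memcpy(bh->b_data, data, len);		/* len must not exceed the block size */
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);		/* wait for old IO, write, wait again */
	brelse(bh);
	return err;
}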
3082 /*
3083 * try_to_free_buffers() checks if all the buffers on this particular page
3084 * are unused, and releases them if so.
3086 * Exclusion against try_to_free_buffers may be obtained by either
3087 * locking the page or by holding its mapping's private_lock.
3089 * If the page is dirty but all the buffers are clean then we need to
3090 * be sure to mark the page clean as well. This is because the page
3091 * may be against a block device, and a later reattachment of buffers
3092 * to a dirty page will set *all* buffers dirty. Which would corrupt
3093 * filesystem data on the same device.
3095 * The same applies to regular filesystem pages: if all the buffers are
3096 * clean then we set the page clean and proceed. To do that, we require
3097 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3098 * private_lock.
3100 * try_to_free_buffers() is non-blocking.
3101 */
3102 static inline int buffer_busy(struct buffer_head *bh)
3104 return atomic_read(&bh->b_count) |
3105 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3108 static int
3109 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3111 struct buffer_head *head = page_buffers(page);
3112 struct buffer_head *bh;
3114 bh = head;
3115 do {
3116 if (buffer_write_io_error(bh) && page->mapping)
3117 set_bit(AS_EIO, &page->mapping->flags);
3118 if (buffer_busy(bh))
3119 goto failed;
3120 bh = bh->b_this_page;
3121 } while (bh != head);
3123 do {
3124 struct buffer_head *next = bh->b_this_page;
3126 if (bh->b_assoc_map)
3127 __remove_assoc_queue(bh);
3128 bh = next;
3129 } while (bh != head);
3130 *buffers_to_free = head;
3131 __clear_page_buffers(page);
3132 return 1;
3133 failed:
3134 return 0;
3137 int try_to_free_buffers(struct page *page)
3139 struct address_space * const mapping = page->mapping;
3140 struct buffer_head *buffers_to_free = NULL;
3141 int ret = 0;
3143 BUG_ON(!PageLocked(page));
3144 if (PageWriteback(page))
3145 return 0;
3147 if (mapping == NULL) { /* can this still happen? */
3148 ret = drop_buffers(page, &buffers_to_free);
3149 goto out;
3152 spin_lock(&mapping->private_lock);
3153 ret = drop_buffers(page, &buffers_to_free);
3156 * If the filesystem writes its buffers by hand (eg ext3)
3157 * then we can have clean buffers against a dirty page. We
3158 * clean the page here; otherwise the VM will never notice
3159 * that the filesystem did any IO at all.
3161 * Also, during truncate, discard_buffer will have marked all
3162 * the page's buffers clean. We discover that here and clean
3163 * the page also.
3165 * private_lock must be held over this entire operation in order
3166 * to synchronise against __set_page_dirty_buffers and prevent the
3167 * dirty bit from being lost.
3169 if (ret)
3170 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3171 spin_unlock(&mapping->private_lock);
3172 out:
3173 if (buffers_to_free) {
3174 struct buffer_head *bh = buffers_to_free;
3176 do {
3177 struct buffer_head *next = bh->b_this_page;
3178 free_buffer_head(bh);
3179 bh = next;
3180 } while (bh != buffers_to_free);
3182 return ret;
3184 EXPORT_SYMBOL(try_to_free_buffers);
3186 void block_sync_page(struct page *page)
3188 struct address_space *mapping;
3190 smp_mb();
3191 mapping = page_mapping(page);
3192 if (mapping)
3193 blk_run_backing_dev(mapping->backing_dev_info, page);
3195 EXPORT_SYMBOL(block_sync_page);
3197 /*
3198 * There are no bdflush tunables left. But distributions are
3199 * still running obsolete flush daemons, so we terminate them here.
3201 * Use of bdflush() is deprecated and will be removed in a future kernel.
3202 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3203 */
3204 SYSCALL_DEFINE2(bdflush, int, func, long, data)
3206 static int msg_count;
3208 if (!capable(CAP_SYS_ADMIN))
3209 return -EPERM;
3211 if (msg_count < 5) {
3212 msg_count++;
3213 printk(KERN_INFO
3214 "warning: process `%s' used the obsolete bdflush"
3215 " system call\n", current->comm);
3216 printk(KERN_INFO "Fix your initscripts?\n");
3219 if (func == 1)
3220 do_exit(0);
3221 return 0;
3222 }
3224 /*
3225 * Buffer-head allocation
3226 */
3227 static struct kmem_cache *bh_cachep;
3230 * Once the number of bh's in the machine exceeds this level, we start
3231 * stripping them in writeback.
3233 static int max_buffer_heads;
3235 int buffer_heads_over_limit;
3237 struct bh_accounting {
3238 int nr; /* Number of live bh's */
3239 int ratelimit; /* Limit cacheline bouncing */
3242 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3244 static void recalc_bh_state(void)
3246 int i;
3247 int tot = 0;
3249 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3250 return;
3251 __get_cpu_var(bh_accounting).ratelimit = 0;
3252 for_each_online_cpu(i)
3253 tot += per_cpu(bh_accounting, i).nr;
3254 buffer_heads_over_limit = (tot > max_buffer_heads);
3257 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3259 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3260 if (ret) {
3261 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3262 get_cpu_var(bh_accounting).nr++;
3263 recalc_bh_state();
3264 put_cpu_var(bh_accounting);
3266 return ret;
3268 EXPORT_SYMBOL(alloc_buffer_head);
3270 void free_buffer_head(struct buffer_head *bh)
3272 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3273 kmem_cache_free(bh_cachep, bh);
3274 get_cpu_var(bh_accounting).nr--;
3275 recalc_bh_state();
3276 put_cpu_var(bh_accounting);
3278 EXPORT_SYMBOL(free_buffer_head);
3280 static void buffer_exit_cpu(int cpu)
3282 int i;
3283 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3285 for (i = 0; i < BH_LRU_SIZE; i++) {
3286 brelse(b->bhs[i]);
3287 b->bhs[i] = NULL;
3289 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3290 per_cpu(bh_accounting, cpu).nr = 0;
3291 put_cpu_var(bh_accounting);
3294 static int buffer_cpu_notify(struct notifier_block *self,
3295 unsigned long action, void *hcpu)
3297 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3298 buffer_exit_cpu((unsigned long)hcpu);
3299 return NOTIFY_OK;
3300 }
3302 /**
3303 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3304 * @bh: struct buffer_head
3306 * Return true if the buffer is up-to-date and false,
3307 * with the buffer locked, if not.
3308 */
3309 int bh_uptodate_or_lock(struct buffer_head *bh)
3311 if (!buffer_uptodate(bh)) {
3312 lock_buffer(bh);
3313 if (!buffer_uptodate(bh))
3314 return 0;
3315 unlock_buffer(bh);
3317 return 1;
3319 EXPORT_SYMBOL(bh_uptodate_or_lock);
3321 /**
3322 * bh_submit_read - Submit a locked buffer for reading
3323 * @bh: struct buffer_head
3325 * Returns zero on success and -EIO on error.
3326 */
3327 int bh_submit_read(struct buffer_head *bh)
3329 BUG_ON(!buffer_locked(bh));
3331 if (buffer_uptodate(bh)) {
3332 unlock_buffer(bh);
3333 return 0;
3336 get_bh(bh);
3337 bh->b_end_io = end_buffer_read_sync;
3338 submit_bh(READ, bh);
3339 wait_on_buffer(bh);
3340 if (buffer_uptodate(bh))
3341 return 0;
3342 return -EIO;
3344 EXPORT_SYMBOL(bh_submit_read);
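The two helpers above are designed to be used as a pair: bh_uptodate_or_lock short-circuits the common case and otherwise leaves the buffer locked for bh_submit_read. The sketch below shows the intended pairing; myfs_read_block is hypothetical.

/* Sketch only: the intended pairing of bh_uptodate_or_lock()/bh_submit_read(). */
static int myfs_read_block(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;		/* already uptodate, nothing to read */
	/* the buffer is now locked and not uptodate: issue the read and wait */
	return bh_submit_read(bh);
}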
3346 void __init buffer_init(void)
3348 int nrpages;
3350 bh_cachep = kmem_cache_create("buffer_head",
3351 sizeof(struct buffer_head), 0,
3352 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3353 SLAB_MEM_SPREAD),
3354 NULL);
3357 * Limit the bh occupancy to 10% of ZONE_NORMAL
3359 nrpages = (nr_free_buffer_pages() * 10) / 100;
3360 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3361 hotcpu_notifier(buffer_cpu_notify, 0);