fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
   5  */
   6
   7 /*
   8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
   9  *
  10  * Removed a lot of unnecessary code and simplified things now that
  11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12  *
  13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
  14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
  15  *
  16  * Added 32k buffer block sizes - these are required older ARM systems. - RMK
  17  *
  18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19  */
  20
  21 #include <linux/config.h>
  22 #include <linux/kernel.h>
  23 #include <linux/fs.h>
  24 #include <linux/mm.h>
  25 #include <linux/percpu.h>
  26 #include <linux/slab.h>
  27 #include <linux/smp_lock.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/file.h>
  30 #include <linux/quotaops.h>
  31 #include <linux/highmem.h>
  32 #include <linux/module.h>
  33 #include <linux/writeback.h>
  34 #include <linux/hash.h>
  35 #include <linux/suspend.h>
  36 #include <linux/buffer_head.h>
  37 #include <linux/bio.h>
  38 #include <linux/notifier.h>
  39 #include <linux/cpu.h>
  40 #include <asm/bitops.h>
  41
  42 static void invalidate_bh_lrus(void);
  43
  44 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  45
  46 /*
  47  * Hashed waitqueue_head's for wait_on_buffer()
  48  */
  49 #define BH_WAIT_TABLE_ORDER     7
  50 static struct bh_wait_queue_head {
  51         wait_queue_head_t wqh;
  52 } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
  53
  54 /*
  55  * Debug/devel support stuff
  56  */
  57
  58 void __buffer_error(char *file, int line)
  59 {
  60         static int enough;
  61
  62         if (enough > 10)
  63                 return;
  64         enough++;
  65         printk("buffer layer error at %s:%d\n", file, line);
  66 #ifndef CONFIG_KALLSYMS
  67         printk("Pass this trace through ksymoops for reporting\n");
  68 #endif
  69         dump_stack();
  70 }
  71 EXPORT_SYMBOL(__buffer_error);
  72
  73 inline void
  74 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
  75 {
  76         bh->b_end_io = handler;
  77         bh->b_private = private;
  78 }
  79
  80 /*
  81  * Return the address of the waitqueue_head to be used for this
  82  * buffer_head
  83  */
  84 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
  85 {
  86         return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
  87 }
  88 EXPORT_SYMBOL(bh_waitq_head);
  89
  90 void wake_up_buffer(struct buffer_head *bh)
  91 {
  92         wait_queue_head_t *wq = bh_waitq_head(bh);
  93
  94         if (waitqueue_active(wq))
  95                 wake_up_all(wq);
  96 }
  97 EXPORT_SYMBOL(wake_up_buffer);
  98
  99 void unlock_buffer(struct buffer_head *bh)
 100 {
 101         /*
 102          * unlock_buffer against a zero-count bh is a bug, if the page
 103          * is not locked.  Because then nothing protects the buffer's
 104          * waitqueue, which is used here. (Well.  Other locked buffers
 105          * against the page will pin it.  But complain anyway).
 106          */
 107         if (atomic_read(&bh->b_count) == 0 &&
 108                         !PageLocked(bh->b_page) &&
 109                         !PageWriteback(bh->b_page))
 110                 buffer_error();
 111
 112         clear_buffer_locked(bh);
 113         smp_mb__after_clear_bit();
 114         wake_up_buffer(bh);
 115 }
 116
 117 /*
 118  * Block until a buffer comes unlocked.  This doesn't stop it
 119  * from becoming locked again - you have to lock it yourself
 120  * if you want to preserve its state.
 121  */
 122 void __wait_on_buffer(struct buffer_head * bh)
 123 {
 124         wait_queue_head_t *wqh = bh_waitq_head(bh);
 125         DEFINE_WAIT(wait);
 126
 127         if (atomic_read(&bh->b_count) == 0 &&
 128                         (!bh->b_page || !PageLocked(bh->b_page)))
 129                 buffer_error();
 130
 131         do {
 132                 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 133                 if (buffer_locked(bh)) {
 134                         blk_run_queues();
 135                         io_schedule();
 136                 }
 137         } while (buffer_locked(bh));
 138         finish_wait(wqh, &wait);
 139 }
 140
 141 static void
 142 __set_page_buffers(struct page *page, struct buffer_head *head)
 143 {
 144         if (page_has_buffers(page))
 145                 buffer_error();
 146         page_cache_get(page);
 147         SetPagePrivate(page);
 148         page->private = (unsigned long)head;
 149 }
 150
 151 static void
 152 __clear_page_buffers(struct page *page)
 153 {
 154         ClearPagePrivate(page);
 155         page->private = 0;
 156         page_cache_release(page);
 157 }
 158
 159 static void buffer_io_error(struct buffer_head *bh)
 160 {
 161         char b[BDEVNAME_SIZE];
 162
 163         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 164                         bdevname(bh->b_bdev, b),
 165                         (unsigned long long)bh->b_blocknr);
 166 }
 167
 168 /*
 169  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 170  * unlock the buffer. This is what ll_rw_block uses too.
 171  */
 172 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 173 {
 174         if (uptodate) {
 175                 set_buffer_uptodate(bh);
 176         } else {
 177                 /*
 178                  * This happens, due to failed READA attempts.
 179                  * buffer_io_error(bh);
 180                  */
 181                 clear_buffer_uptodate(bh);
 182         }
 183         unlock_buffer(bh);
 184         put_bh(bh);
 185 }
 186
 187 /*
 188  * Write out and wait upon all the dirty data associated with a block
 189  * device via its mapping.  Does not take the superblock lock.
 190  */
 191 int sync_blockdev(struct block_device *bdev)
 192 {
 193         int ret = 0;
 194
 195         if (bdev) {
 196                 int err;
 197
 198                 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
 199                 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
 200                 if (!ret)
 201                         ret = err;
 202         }
 203         return ret;
 204 }
 205 EXPORT_SYMBOL(sync_blockdev);
 206
 207 /*
 208  * Write out and wait upon all dirty data associated with this
 209  * superblock.  Filesystem data as well as the underlying block
 210  * device.  Takes the superblock lock.
 211  */
 212 int fsync_super(struct super_block *sb)
 213 {
 214         sync_inodes_sb(sb, 0);
 215         DQUOT_SYNC(sb);
 216         lock_super(sb);
 217         if (sb->s_dirt && sb->s_op->write_super)
 218                 sb->s_op->write_super(sb);
 219         unlock_super(sb);
 220         if (sb->s_op->sync_fs)
 221                 sb->s_op->sync_fs(sb, 1);
 222         sync_blockdev(sb->s_bdev);
 223         sync_inodes_sb(sb, 1);
 224
 225         return sync_blockdev(sb->s_bdev);
 226 }
 227
 228 /*
 229  * Write out and wait upon all dirty data associated with this
 230  * device.   Filesystem data as well as the underlying block
 231  * device.  Takes the superblock lock.
 232  */
 233 int fsync_bdev(struct block_device *bdev)
 234 {
 235         struct super_block *sb = get_super(bdev);
 236         if (sb) {
 237                 int res = fsync_super(sb);
 238                 drop_super(sb);
 239                 return res;
 240         }
 241         return sync_blockdev(bdev);
 242 }
 243
 244 /*
 245  * sync everything.  Start out by waking pdflush, because that writes back
 246  * all queues in parallel.
 247  */
 248 static void do_sync(unsigned long wait)
 249 {
 250         wakeup_bdflush(0);
 251         sync_inodes(0);         /* All mappings, inodes and their blockdevs */
 252         DQUOT_SYNC(NULL);
 253         sync_supers();          /* Write the superblocks */
 254         sync_filesystems(0);    /* Start syncing the filesystems */
 255         sync_filesystems(wait); /* Waitingly sync the filesystems */
 256         sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
 257         if (!wait)
 258                 printk("Emergency Sync complete\n");
 259 }
 260
 261 asmlinkage long sys_sync(void)
 262 {
 263         do_sync(1);
 264         return 0;
 265 }
 266
 267 void emergency_sync(void)
 268 {
 269         pdflush_operation(do_sync, 0);
 270 }
 271
 272 /*
 273  * Generic function to fsync a file.
 274  *
 275  * filp may be NULL if called via the msync of a vma.
 276  */
 277
 278 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 279 {
 280         struct inode * inode = dentry->d_inode;
 281         struct super_block * sb;
 282         int ret;
 283
 284         /* sync the inode to buffers */
 285         write_inode_now(inode, 0);
 286
 287         /* sync the superblock to buffers */
 288         sb = inode->i_sb;
 289         lock_super(sb);
 290         if (sb->s_op->write_super)
 291                 sb->s_op->write_super(sb);
 292         unlock_super(sb);
 293
 294         /* .. finally sync the buffers to disk */
 295         ret = sync_blockdev(sb->s_bdev);
 296         return ret;
 297 }
 298
 299 asmlinkage long sys_fsync(unsigned int fd)
 300 {
 301         struct file * file;
 302         struct dentry * dentry;
 303         struct inode * inode;
 304         int ret, err;
 305
 306         ret = -EBADF;
 307         file = fget(fd);
 308         if (!file)
 309                 goto out;
 310
 311         dentry = file->f_dentry;
 312         inode = dentry->d_inode;
 313
 314         ret = -EINVAL;
 315         if (!file->f_op || !file->f_op->fsync) {
 316                 /* Why?  We can still call filemap_fdatawrite */
 317                 goto out_putf;
 318         }
 319
 320         /* We need to protect against concurrent writers.. */
 321         down(&inode->i_sem);
 322         current->flags |= PF_SYNCWRITE;
 323         ret = filemap_fdatawrite(inode->i_mapping);
 324         err = file->f_op->fsync(file, dentry, 0);
 325         if (!ret)
 326                 ret = err;
 327         err = filemap_fdatawait(inode->i_mapping);
 328         if (!ret)
 329                 ret = err;
 330         current->flags &= ~PF_SYNCWRITE;
 331         up(&inode->i_sem);
 332
 333 out_putf:
 334         fput(file);
 335 out:
 336         return ret;
 337 }
 338
 339 asmlinkage long sys_fdatasync(unsigned int fd)
 340 {
 341         struct file * file;
 342         struct dentry * dentry;
 343         struct inode * inode;
 344         int ret, err;
 345
 346         ret = -EBADF;
 347         file = fget(fd);
 348         if (!file)
 349                 goto out;
 350
 351         dentry = file->f_dentry;
 352         inode = dentry->d_inode;
 353
 354         ret = -EINVAL;
 355         if (!file->f_op || !file->f_op->fsync)
 356                 goto out_putf;
 357
 358         down(&inode->i_sem);
 359         current->flags |= PF_SYNCWRITE;
 360         ret = filemap_fdatawrite(inode->i_mapping);
 361         err = file->f_op->fsync(file, dentry, 1);
 362         if (!ret)
 363                 ret = err;
 364         err = filemap_fdatawait(inode->i_mapping);
 365         if (!ret)
 366                 ret = err;
 367         current->flags &= ~PF_SYNCWRITE;
 368         up(&inode->i_sem);
 369
 370 out_putf:
 371         fput(file);
 372 out:
 373         return ret;
 374 }
 375
 376 /*
 377  * Various filesystems appear to want __find_get_block to be non-blocking.
 378  * But it's the page lock which protects the buffers.  To get around this,
 379  * we get exclusion from try_to_free_buffers with the blockdev mapping's
 380  * private_lock.
 381  *
 382  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 383  * may be quite high.  This code could TryLock the page, and if that
 384  * succeeds, there is no need to take private_lock. (But if
 385  * private_lock is contended then so is mapping->page_lock).
 386  */
 387 static struct buffer_head *
 388 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
 389 {
 390         struct inode *bd_inode = bdev->bd_inode;
 391         struct address_space *bd_mapping = bd_inode->i_mapping;
 392         struct buffer_head *ret = NULL;
 393         unsigned long index;
 394         struct buffer_head *bh;
 395         struct buffer_head *head;
 396         struct page *page;
 397
 398         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
 399         page = find_get_page(bd_mapping, index);
 400         if (!page)
 401                 goto out;
 402
 403         spin_lock(&bd_mapping->private_lock);
 404         if (!page_has_buffers(page))
 405                 goto out_unlock;
 406         head = page_buffers(page);
 407         bh = head;
 408         do {
 409                 if (bh->b_blocknr == block) {
 410                         ret = bh;
 411                         get_bh(bh);
 412                         goto out_unlock;
 413                 }
 414                 bh = bh->b_this_page;
 415         } while (bh != head);
 416         buffer_error();
 417 out_unlock:
 418         spin_unlock(&bd_mapping->private_lock);
 419         page_cache_release(page);
 420 out:
 421         return ret;
 422 }
 423
 424 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 425    of fs corruption is going on. Trashing dirty data always imply losing
 426    information that was supposed to be just stored on the physical layer
 427    by the user.
 428
 429    Thus invalidate_buffers in general usage is not allwowed to trash
 430    dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
 431    be preserved.  These buffers are simply skipped.
 432
 433    We also skip buffers which are still in use.  For example this can
 434    happen if a userspace program is reading the block device.
 435
 436    NOTE: In the case where the user removed a removable-media-disk even if
 437    there's still dirty data not synced on disk (due a bug in the device driver
 438    or due an error of the user), by not destroying the dirty buffers we could
 439    generate corruption also on the next media inserted, thus a parameter is
 440    necessary to handle this case in the most safe way possible (trying
 441    to not corrupt also the new disk inserted with the data belonging to
 442    the old now corrupted disk). Also for the ramdisk the natural thing
 443    to do in order to release the ramdisk memory is to destroy dirty buffers.
 444
 445    These are two special cases. Normal usage imply the device driver
 446    to issue a sync on the device (without waiting I/O completion) and
 447    then an invalidate_buffers call that doesn't trash dirty buffers.
 448
 449    For handling cache coherency with the blkdev pagecache the 'update' case
 450    is been introduced. It is needed to re-read from disk any pinned
 451    buffer. NOTE: re-reading from disk is destructive so we can do it only
 452    when we assume nobody is changing the buffercache under our I/O and when
 453    we think the disk contains more recent information than the buffercache.
 454    The update == 1 pass marks the buffers we need to update, the update == 2
 455    pass does the actual I/O. */
 456 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 457 {
 458         invalidate_bh_lrus();
 459         /*
 460          * FIXME: what about destroy_dirty_buffers?
 461          * We really want to use invalidate_inode_pages2() for
 462          * that, but not until that's cleaned up.
 463          */
 464         invalidate_inode_pages(bdev->bd_inode->i_mapping);
 465 }
 466
 467 /*
 468  * Kick pdflush then try to free up some ZONE_NORMAL memory.
 469  */
 470 static void free_more_memory(void)
 471 {
 472         struct zone *zone;
 473         pg_data_t *pgdat;
 474
 475         wakeup_bdflush(1024);
 476         blk_run_queues();
 477         yield();
 478
 479         for_each_pgdat(pgdat) {
 480                 zone = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones[0];
 481                 if (zone)
 482                         try_to_free_pages(zone, GFP_NOFS, 0);
 483         }
 484 }
 485
 486 /*
 487  * I/O completion handler for block_read_full_page() - pages
 488  * which come unlocked at the end of I/O.
 489  */
 490 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 491 {
 492         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 493         unsigned long flags;
 494         struct buffer_head *tmp;
 495         struct page *page;
 496         int page_uptodate = 1;
 497
 498         BUG_ON(!buffer_async_read(bh));
 499
 500         page = bh->b_page;
 501         if (uptodate) {
 502                 set_buffer_uptodate(bh);
 503         } else {
 504                 clear_buffer_uptodate(bh);
 505                 buffer_io_error(bh);
 506                 SetPageError(page);
 507         }
 508
 509         /*
 510          * Be _very_ careful from here on. Bad things can happen if
 511          * two buffer heads end IO at almost the same time and both
 512          * decide that the page is now completely done.
 513          */
 514         spin_lock_irqsave(&page_uptodate_lock, flags);
 515         clear_buffer_async_read(bh);
 516         unlock_buffer(bh);
 517         tmp = bh;
 518         do {
 519                 if (!buffer_uptodate(tmp))
 520                         page_uptodate = 0;
 521                 if (buffer_async_read(tmp)) {
 522                         BUG_ON(!buffer_locked(tmp));
 523                         goto still_busy;
 524                 }
 525                 tmp = tmp->b_this_page;
 526         } while (tmp != bh);
 527         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 528
 529         /*
 530          * If none of the buffers had errors and they are all
 531          * uptodate then we can set the page uptodate.
 532          */
 533         if (page_uptodate && !PageError(page))
 534                 SetPageUptodate(page);
 535         unlock_page(page);
 536         return;
 537
 538 still_busy:
 539         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 540         return;
 541 }
 542
 543 /*
 544  * Completion handler for block_write_full_page() - pages which are unlocked
 545  * during I/O, and which have PageWriteback cleared upon I/O completion.
 546  */
 547 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 548 {
 549         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 550         unsigned long flags;
 551         struct buffer_head *tmp;
 552         struct page *page;
 553
 554         BUG_ON(!buffer_async_write(bh));
 555
 556         page = bh->b_page;
 557         if (uptodate) {
 558                 set_buffer_uptodate(bh);
 559         } else {
 560                 buffer_io_error(bh);
 561                 clear_buffer_uptodate(bh);
 562                 SetPageError(page);
 563         }
 564
 565         spin_lock_irqsave(&page_uptodate_lock, flags);
 566         clear_buffer_async_write(bh);
 567         unlock_buffer(bh);
 568         tmp = bh->b_this_page;
 569         while (tmp != bh) {
 570                 if (buffer_async_write(tmp)) {
 571                         BUG_ON(!buffer_locked(tmp));
 572                         goto still_busy;
 573                 }
 574                 tmp = tmp->b_this_page;
 575         }
 576         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 577         end_page_writeback(page);
 578         return;
 579
 580 still_busy:
 581         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 582         return;
 583 }
 584
 585 /*
 586  * If a page's buffers are under async readin (end_buffer_async_read
 587  * completion) then there is a possibility that another thread of
 588  * control could lock one of the buffers after it has completed
 589  * but while some of the other buffers have not completed.  This
 590  * locked buffer would confuse end_buffer_async_read() into not unlocking
 591  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 592  * that this buffer is not under async I/O.
 593  *
 594  * The page comes unlocked when it has no locked buffer_async buffers
 595  * left.
 596  *
 597  * PageLocked prevents anyone starting new async I/O reads any of
 598  * the buffers.
 599  *
 600  * PageWriteback is used to prevent simultaneous writeout of the same
 601  * page.
 602  *
 603  * PageLocked prevents anyone from starting writeback of a page which is
 604  * under read I/O (PageWriteback is only ever set against a locked page).
 605  */
 606 void mark_buffer_async_read(struct buffer_head *bh)
 607 {
 608         bh->b_end_io = end_buffer_async_read;
 609         set_buffer_async_read(bh);
 610 }
 611 EXPORT_SYMBOL(mark_buffer_async_read);
 612
 613 void mark_buffer_async_write(struct buffer_head *bh)
 614 {
 615         bh->b_end_io = end_buffer_async_write;
 616         set_buffer_async_write(bh);
 617 }
 618 EXPORT_SYMBOL(mark_buffer_async_write);
 619
 620
 621 /*
 622  * fs/buffer.c contains helper functions for buffer-backed address space's
 623  * fsync functions.  A common requirement for buffer-based filesystems is
 624  * that certain data from the backing blockdev needs to be written out for
 625  * a successful fsync().  For example, ext2 indirect blocks need to be
 626  * written back and waited upon before fsync() returns.
 627  *
 628  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 629  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 630  * management of a list of dependent buffers at ->i_mapping->private_list.
 631  *
 632  * Locking is a little subtle: try_to_free_buffers() will remove buffers
 633  * from their controlling inode's queue when they are being freed.  But
 634  * try_to_free_buffers() will be operating against the *blockdev* mapping
 635  * at the time, not against the S_ISREG file which depends on those buffers.
 636  * So the locking for private_list is via the private_lock in the address_space
 637  * which backs the buffers.  Which is different from the address_space
 638  * against which the buffers are listed.  So for a particular address_space,
 639  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 640  * mapping->private_list will always be protected by the backing blockdev's
 641  * ->private_lock.
 642  *
 643  * Which introduces a requirement: all buffers on an address_space's
 644  * ->private_list must be from the same address_space: the blockdev's.
 645  *
 646  * address_spaces which do not place buffers at ->private_list via these
 647  * utility functions are free to use private_lock and private_list for
 648  * whatever they want.  The only requirement is that list_empty(private_list)
 649  * be true at clear_inode() time.
 650  *
 651  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 652  * filesystems should do that.  invalidate_inode_buffers() should just go
 653  * BUG_ON(!list_empty).
 654  *
 655  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 656  * take an address_space, not an inode.  And it should be called
 657  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 658  * queued up.
 659  *
 660  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 661  * list if it is already on a list.  Because if the buffer is on a list,
 662  * it *must* already be on the right one.  If not, the filesystem is being
 663  * silly.  This will save a ton of locking.  But first we have to ensure
 664  * that buffers are taken *off* the old inode's list when they are freed
 665  * (presumably in truncate).  That requires careful auditing of all
 666  * filesystems (do it inside bforget()).  It could also be done by bringing
 667  * b_inode back.
 668  */
 669
 670 void buffer_insert_list(spinlock_t *lock,
 671                 struct buffer_head *bh, struct list_head *list)
 672 {
 673         spin_lock(lock);
 674         list_move_tail(&bh->b_assoc_buffers, list);
 675         spin_unlock(lock);
 676 }
 677
 678 /*
 679  * The buffer's backing address_space's private_lock must be held
 680  */
 681 static inline void __remove_assoc_queue(struct buffer_head *bh)
 682 {
 683         list_del_init(&bh->b_assoc_buffers);
 684 }
 685
 686 int inode_has_buffers(struct inode *inode)
 687 {
 688         return !list_empty(&inode->i_data.private_list);
 689 }
 690
 691 /*
 692  * osync is designed to support O_SYNC io.  It waits synchronously for
 693  * all already-submitted IO to complete, but does not queue any new
 694  * writes to the disk.
 695  *
 696  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 697  * you dirty the buffers, and then use osync_inode_buffers to wait for
 698  * completion.  Any other dirty buffers which are not yet queued for
 699  * write will not be flushed to disk by the osync.
 700  */
 701 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
 702 {
 703         struct buffer_head *bh;
 704         struct list_head *p;
 705         int err = 0;
 706
 707         spin_lock(lock);
 708 repeat:
 709         list_for_each_prev(p, list) {
 710                 bh = BH_ENTRY(p);
 711                 if (buffer_locked(bh)) {
 712                         get_bh(bh);
 713                         spin_unlock(lock);
 714                         wait_on_buffer(bh);
 715                         if (!buffer_uptodate(bh))
 716                                 err = -EIO;
 717                         brelse(bh);
 718                         spin_lock(lock);
 719                         goto repeat;
 720                 }
 721         }
 722         spin_unlock(lock);
 723         return err;
 724 }
 725
 726 /**
 727  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
 728  *                        buffers
 729  * @buffer_mapping - the mapping which backs the buffers' data
 730  * @mapping - the mapping which wants those buffers written
 731  *
 732  * Starts I/O against the buffers at mapping->private_list, and waits upon
 733  * that I/O.
 734  *
 735  * Basically, this is a convenience function for fsync().  @buffer_mapping is
 736  * the blockdev which "owns" the buffers and @mapping is a file or directory
 737  * which needs those buffers to be written for a successful fsync().
 738  */
 739 int sync_mapping_buffers(struct address_space *mapping)
 740 {
 741         struct address_space *buffer_mapping = mapping->assoc_mapping;
 742
 743         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 744                 return 0;
 745
 746         return fsync_buffers_list(&buffer_mapping->private_lock,
 747                                         &mapping->private_list);
 748 }
 749 EXPORT_SYMBOL(sync_mapping_buffers);
 750
 751 /*
 752  * Called when we've recently written block `bblock', and it is known that
 753  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 754  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 755  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 756  */
 757 void write_boundary_block(struct block_device *bdev,
 758                         sector_t bblock, unsigned blocksize)
 759 {
 760         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
 761         if (bh) {
 762                 if (buffer_dirty(bh))
 763                         ll_rw_block(WRITE, 1, &bh);
 764                 put_bh(bh);
 765         }
 766 }
 767
 768 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 769 {
 770         struct address_space *mapping = inode->i_mapping;
 771         struct address_space *buffer_mapping = bh->b_page->mapping;
 772
 773         mark_buffer_dirty(bh);
 774         if (!mapping->assoc_mapping) {
 775                 mapping->assoc_mapping = buffer_mapping;
 776         } else {
 777                 if (mapping->assoc_mapping != buffer_mapping)
 778                         BUG();
 779         }
 780         if (list_empty(&bh->b_assoc_buffers))
 781                 buffer_insert_list(&buffer_mapping->private_lock,
 782                                 bh, &mapping->private_list);
 783 }
 784 EXPORT_SYMBOL(mark_buffer_dirty_inode);
 785
 786 /*
 787  * Add a page to the dirty page list.
 788  *
 789  * It is a sad fact of life that this function is called from several places
 790  * deeply under spinlocking.  It may not sleep.
 791  *
 792  * If the page has buffers, the uptodate buffers are set dirty, to preserve
 793  * dirty-state coherency between the page and the buffers.  It the page does
 794  * not have buffers then when they are later attached they will all be set
 795  * dirty.
 796  *
 797  * The buffers are dirtied before the page is dirtied.  There's a small race
 798  * window in which a writepage caller may see the page cleanness but not the
 799  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 800  * before the buffers, a concurrent writepage caller could clear the page dirty
 801  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 802  * page on the dirty page list.
 803  *
 804  * There is also a small window where the page is dirty, and not on dirty_pages.
 805  * Also a possibility that by the time the page is added to dirty_pages, it has
 806  * been set clean.  The page lists are somewhat approximate in this regard.
 807  * It's better to have clean pages accidentally attached to dirty_pages than to
 808  * leave dirty pages attached to clean_pages.
 809  *
 810  * We use private_lock to lock against try_to_free_buffers while using the
 811  * page's buffer list.  Also use this to protect against clean buffers being
 812  * added to the page after it was set dirty.
 813  *
 814  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 815  * address_space though.
 816  *
 817  * For now, we treat swapper_space specially.  It doesn't use the normal
 818  * block a_ops.
 819  */
 820 int __set_page_dirty_buffers(struct page *page)
 821 {
 822         struct address_space * const mapping = page->mapping;
 823         int ret = 0;
 824
 825         if (mapping == NULL) {
 826                 SetPageDirty(page);
 827                 goto out;
 828         }
 829
 830         spin_lock(&mapping->private_lock);
 831         if (page_has_buffers(page)) {
 832                 struct buffer_head *head = page_buffers(page);
 833                 struct buffer_head *bh = head;
 834
 835                 do {
 836                         if (buffer_uptodate(bh))
 837                                 set_buffer_dirty(bh);
 838                         else
 839                                 buffer_error();
 840                         bh = bh->b_this_page;
 841                 } while (bh != head);
 842         }
 843         spin_unlock(&mapping->private_lock);
 844
 845         if (!TestSetPageDirty(page)) {
 846                 spin_lock(&mapping->page_lock);
 847                 if (page->mapping) {    /* Race with truncate? */
 848                         if (!mapping->backing_dev_info->memory_backed)
 849                                 inc_page_state(nr_dirty);
 850                         list_del(&page->list);
 851                         list_add(&page->list, &mapping->dirty_pages);
 852                 }
 853                 spin_unlock(&mapping->page_lock);
 854                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 855         }
 856
 857 out:
 858         return ret;
 859 }
 860 EXPORT_SYMBOL(__set_page_dirty_buffers);
 861
 862 /*
 863  * Write out and wait upon a list of buffers.
 864  *
 865  * We have conflicting pressures: we want to make sure that all
 866  * initially dirty buffers get waited on, but that any subsequently
 867  * dirtied buffers don't.  After all, we don't want fsync to last
 868  * forever if somebody is actively writing to the file.
 869  *
 870  * Do this in two main stages: first we copy dirty buffers to a
 871  * temporary inode list, queueing the writes as we go.  Then we clean
 872  * up, waiting for those writes to complete.
 873  *
 874  * During this second stage, any subsequent updates to the file may end
 875  * up refiling the buffer on the original inode's dirty list again, so
 876  * there is a chance we will end up with a buffer queued for write but
 877  * not yet completed on that list.  So, as a final cleanup we go through
 878  * the osync code to catch these locked, dirty buffers without requeuing
 879  * any newly dirty buffers for write.
 880  */
 881 int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 882 {
 883         struct buffer_head *bh;
 884         struct list_head tmp;
 885         int err = 0, err2;
 886
 887         INIT_LIST_HEAD(&tmp);
 888
 889         spin_lock(lock);
 890         while (!list_empty(list)) {
 891                 bh = BH_ENTRY(list->next);
 892                 list_del_init(&bh->b_assoc_buffers);
 893                 if (buffer_dirty(bh) || buffer_locked(bh)) {
 894                         list_add(&bh->b_assoc_buffers, &tmp);
 895                         if (buffer_dirty(bh)) {
 896                                 get_bh(bh);
 897                                 spin_unlock(lock);
 898                                 /*
 899                                  * Ensure any pending I/O completes so that
 900                                  * ll_rw_block() actually writes the current
 901                                  * contents - it is a noop if I/O is still in
 902                                  * flight on potentially older contents.
 903                                  */
 904                                 wait_on_buffer(bh);
 905                                 ll_rw_block(WRITE, 1, &bh);
 906                                 brelse(bh);
 907                                 spin_lock(lock);
 908                         }
 909                 }
 910         }
 911
 912         while (!list_empty(&tmp)) {
 913                 bh = BH_ENTRY(tmp.prev);
 914                 __remove_assoc_queue(bh);
 915                 get_bh(bh);
 916                 spin_unlock(lock);
 917                 wait_on_buffer(bh);
 918                 if (!buffer_uptodate(bh))
 919                         err = -EIO;
 920                 brelse(bh);
 921                 spin_lock(lock);
 922         }
 923
 924         spin_unlock(lock);
 925         err2 = osync_buffers_list(lock, list);
 926         if (err)
 927                 return err;
 928         else
 929                 return err2;
 930 }
 931
 932 /*
 933  * Invalidate any and all dirty buffers on a given inode.  We are
 934  * probably unmounting the fs, but that doesn't mean we have already
 935  * done a sync().  Just drop the buffers from the inode list.
 936  *
 937  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 938  * assumes that all the buffers are against the blockdev.  Not true
 939  * for reiserfs.
 940  */
 941 void invalidate_inode_buffers(struct inode *inode)
 942 {
 943         if (inode_has_buffers(inode)) {
 944                 struct address_space *mapping = &inode->i_data;
 945                 struct list_head *list = &mapping->private_list;
 946                 struct address_space *buffer_mapping = mapping->assoc_mapping;
 947
 948                 spin_lock(&buffer_mapping->private_lock);
 949                 while (!list_empty(list))
 950                         __remove_assoc_queue(BH_ENTRY(list->next));
 951                 spin_unlock(&buffer_mapping->private_lock);
 952         }
 953 }
 954
 955 /*
 956  * Remove any clean buffers from the inode's buffer list.  This is called
 957  * when we're trying to free the inode itself.  Those buffers can pin it.
 958  *
 959  * Returns true if all buffers were removed.
 960  */
 961 int remove_inode_buffers(struct inode *inode)
 962 {
 963         int ret = 1;
 964
 965         if (inode_has_buffers(inode)) {
 966                 struct address_space *mapping = &inode->i_data;
 967                 struct list_head *list = &mapping->private_list;
 968                 struct address_space *buffer_mapping = mapping->assoc_mapping;
 969
 970                 spin_lock(&buffer_mapping->private_lock);
 971                 while (!list_empty(list)) {
 972                         struct buffer_head *bh = BH_ENTRY(list->next);
 973                         if (buffer_dirty(bh)) {
 974                                 ret = 0;
 975                                 break;
 976                         }
 977                         __remove_assoc_queue(bh);
 978                 }
 979                 spin_unlock(&buffer_mapping->private_lock);
 980         }
 981         return ret;
 982 }
 983
 984 /*
 985  * Create the appropriate buffers when given a page for data area and
 986  * the size of each buffer.. Use the bh->b_this_page linked list to
 987  * follow the buffers created.  Return NULL if unable to create more
 988  * buffers.
 989  *
 990  * The retry flag is used to differentiate async IO (paging, swapping)
 991  * which may not fail from ordinary buffer allocations.
 992  */
 993 static struct buffer_head *
 994 create_buffers(struct page * page, unsigned long size, int retry)
 995 {
 996         struct buffer_head *bh, *head;
 997         long offset;
 998
 999 try_again:
1000         head = NULL;
1001         offset = PAGE_SIZE;
1002         while ((offset -= size) >= 0) {
1003                 bh = alloc_buffer_head(GFP_NOFS);
1004                 if (!bh)
1005                         goto no_grow;
1006
1007                 bh->b_bdev = NULL;
1008                 bh->b_this_page = head;
1009                 bh->b_blocknr = -1;
1010                 head = bh;
1011
1012                 bh->b_state = 0;
1013                 atomic_set(&bh->b_count, 0);
1014                 bh->b_size = size;
1015
1016                 /* Link the buffer to its page */
1017                 set_bh_page(bh, page, offset);
1018
1019                 bh->b_end_io = NULL;
1020         }
1021         return head;
1022 /*
1023  * In case anything failed, we just free everything we got.
1024  */
1025 no_grow:
1026         if (head) {
1027                 do {
1028                         bh = head;
1029                         head = head->b_this_page;
1030                         free_buffer_head(bh);
1031                 } while (head);
1032         }
1033
1034         /*
1035          * Return failure for non-async IO requests.  Async IO requests
1036          * are not allowed to fail, so we have to wait until buffer heads
1037          * become available.  But we don't want tasks sleeping with
1038          * partially complete buffers, so all were released above.
1039          */
1040         if (!retry)
1041                 return NULL;
1042
1043         /* We're _really_ low on memory. Now we just
1044          * wait for old buffer heads to become free due to
1045          * finishing IO.  Since this is an async request and
1046          * the reserve list is empty, we're sure there are
1047          * async buffer heads in use.
1048          */
1049         free_more_memory();
1050         goto try_again;
1051 }
1052
1053 static inline void
1054 link_dev_buffers(struct page *page, struct buffer_head *head)
1055 {
1056         struct buffer_head *bh, *tail;
1057
1058         bh = head;
1059         do {
1060                 tail = bh;
1061                 bh = bh->b_this_page;
1062         } while (bh);
1063         tail->b_this_page = head;
1064         __set_page_buffers(page, head);
1065 }
1066
1067 /*
1068  * Initialise the state of a blockdev page's buffers.
1069  */
1070 static void
1071 init_page_buffers(struct page *page, struct block_device *bdev,
1072                         int block, int size)
1073 {
1074         struct buffer_head *head = page_buffers(page);
1075         struct buffer_head *bh = head;
1076         unsigned int b_state;
1077
1078         b_state = 1 << BH_Mapped;
1079         if (PageUptodate(page))
1080                 b_state |= 1 << BH_Uptodate;
1081
1082         do {
1083                 if (!(bh->b_state & (1 << BH_Mapped))) {
1084                         init_buffer(bh, NULL, NULL);
1085                         bh->b_bdev = bdev;
1086                         bh->b_blocknr = block;
1087                         bh->b_state = b_state;
1088                 }
1089                 block++;
1090                 bh = bh->b_this_page;
1091         } while (bh != head);
1092 }
1093
1094 /*
1095  * Create the page-cache page that contains the requested block.
1096  *
1097  * This is user purely for blockdev mappings.
1098  */
1099 static struct page *
1100 grow_dev_page(struct block_device *bdev, unsigned long block,
1101                         unsigned long index, int size)
1102 {
1103         struct inode *inode = bdev->bd_inode;
1104         struct page *page;
1105         struct buffer_head *bh;
1106
1107         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1108         if (!page)
1109                 return NULL;
1110
1111         if (!PageLocked(page))
1112                 BUG();
1113
1114         if (page_has_buffers(page)) {
1115                 bh = page_buffers(page);
1116                 if (bh->b_size == size)
1117                         return page;
1118                 if (!try_to_free_buffers(page))
1119                         goto failed;
1120         }
1121
1122         /*
1123          * Allocate some buffers for this page
1124          */
1125         bh = create_buffers(page, size, 0);
1126         if (!bh)
1127                 goto failed;
1128
1129         /*
1130          * Link the page to the buffers and initialise them.  Take the
1131          * lock to be atomic wrt __find_get_block(), which does not
1132          * run under the page lock.
1133          */
1134         spin_lock(&inode->i_mapping->private_lock);
1135         link_dev_buffers(page, bh);
1136         init_page_buffers(page, bdev, block, size);
1137         spin_unlock(&inode->i_mapping->private_lock);
1138         return page;
1139
1140 failed:
1141         buffer_error();
1142         unlock_page(page);
1143         page_cache_release(page);
1144         return NULL;
1145 }
1146
1147 /*
1148  * Create buffers for the specified block device block's page.  If
1149  * that page was dirty, the buffers are set dirty also.
1150  *
1151  * Except that's a bug.  Attaching dirty buffers to a dirty
1152  * blockdev's page can result in filesystem corruption, because
1153  * some of those buffers may be aliases of filesystem data.
1154  * grow_dev_page() will go BUG() if this happens.
1155  */
1156 static inline int
1157 grow_buffers(struct block_device *bdev, unsigned long block, int size)
1158 {
1159         struct page *page;
1160         unsigned long index;
1161         int sizebits;
1162
1163         /* Size must be multiple of hard sectorsize */
1164         if (size & (bdev_hardsect_size(bdev)-1))
1165                 BUG();
1166         if (size < 512 || size > PAGE_SIZE)
1167                 BUG();
1168
1169         sizebits = -1;
1170         do {
1171                 sizebits++;
1172         } while ((size << sizebits) < PAGE_SIZE);
1173
1174         index = block >> sizebits;
1175         block = index << sizebits;
1176
1177         /* Create a page with the proper size buffers.. */
1178         page = grow_dev_page(bdev, block, index, size);
1179         if (!page)
1180                 return 0;
1181         unlock_page(page);
1182         page_cache_release(page);
1183         return 1;
1184 }
1185
1186 struct buffer_head *
1187 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1188 {
1189         for (;;) {
1190                 struct buffer_head * bh;
1191
1192                 bh = __find_get_block(bdev, block, size);
1193                 if (bh)
1194                         return bh;
1195
1196                 if (!grow_buffers(bdev, block, size))
1197                         free_more_memory();
1198         }
1199 }
1200
1201 /*
1202  * The relationship between dirty buffers and dirty pages:
1203  *
1204  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1205  * the page appears on its address_space.dirty_pages list.
1206  *
1207  * At all times, the dirtiness of the buffers represents the dirtiness of
1208  * subsections of the page.  If the page has buffers, the page dirty bit is
1209  * merely a hint about the true dirty state.
1210  *
1211  * When a page is set dirty in its entirety, all its buffers are marked dirty
1212  * (if the page has buffers).
1213  *
1214  * When a buffer is marked dirty, its page is dirtied, but the page's other
1215  * buffers are not.
1216  *
1217  * Also.  When blockdev buffers are explicitly read with bread(), they
1218  * individually become uptodate.  But their backing page remains not
1219  * uptodate - even if all of its buffers are uptodate.  A subsequent
1220  * block_read_full_page() against that page will discover all the uptodate
1221  * buffers, will set the page uptodate and will perform no I/O.
1222  */
1223
1224 /**
1225  * mark_buffer_dirty - mark a buffer_head as needing writeout
1226  *
1227  * mark_buffer_dirty() will set the dirty bit against the buffer,
1228  * then set its backing page dirty, then attach the page to its
1229  * address_space's dirty_pages list and then attach the address_space's
1230  * inode to its superblock's dirty inode list.
1231  *
1232  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1233  * mapping->page_lock and the global inode_lock.
1234  */
1235 void mark_buffer_dirty(struct buffer_head *bh)
1236 {
1237         if (!buffer_uptodate(bh))
1238                 buffer_error();
1239         if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1240                 __set_page_dirty_nobuffers(bh->b_page);
1241 }
1242
1243 /*
1244  * Decrement a buffer_head's reference count.  If all buffers against a page
1245  * have zero reference count, are clean and unlocked, and if the page is clean
1246  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1247  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1248  * a page but it ends up not being freed, and buffers may later be reattached).
1249  */
1250 void __brelse(struct buffer_head * buf)
1251 {
1252         if (atomic_read(&buf->b_count)) {
1253                 put_bh(buf);
1254                 return;
1255         }
1256         printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1257         buffer_error();         /* For the stack backtrace */
1258 }
1259
1260 /*
1261  * bforget() is like brelse(), except it discards any
1262  * potentially dirty data.
1263  */
1264 void __bforget(struct buffer_head *bh)
1265 {
1266         clear_buffer_dirty(bh);
1267         if (!list_empty(&bh->b_assoc_buffers)) {
1268                 struct address_space *buffer_mapping = bh->b_page->mapping;
1269
1270                 spin_lock(&buffer_mapping->private_lock);
1271                 list_del_init(&bh->b_assoc_buffers);
1272                 spin_unlock(&buffer_mapping->private_lock);
1273         }
1274         __brelse(bh);
1275 }
1276
1277 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1278 {
1279         lock_buffer(bh);
1280         if (buffer_uptodate(bh)) {
1281                 unlock_buffer(bh);
1282                 return bh;
1283         } else {
1284                 if (buffer_dirty(bh))
1285                         buffer_error();
1286                 get_bh(bh);
1287                 bh->b_end_io = end_buffer_io_sync;
1288                 submit_bh(READ, bh);
1289                 wait_on_buffer(bh);
1290                 if (buffer_uptodate(bh))
1291                         return bh;
1292         }
1293         brelse(bh);
1294         return NULL;
1295 }
1296
1297 /*
1298  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1299  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1300  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1301  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1302  * CPU's LRUs at the same time.
1303  *
1304  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1305  * sb_find_get_block().
1306  *
1307  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1308  * a local interrupt disable for that.
1309  */
1310
1311 #define BH_LRU_SIZE     8
1312
1313 struct bh_lru {
1314         struct buffer_head *bhs[BH_LRU_SIZE];
1315 };
1316
1317 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{0}};
1318
1319 #ifdef CONFIG_SMP
1320 #define bh_lru_lock()   local_irq_disable()
1321 #define bh_lru_unlock() local_irq_enable()
1322 #else
1323 #define bh_lru_lock()   preempt_disable()
1324 #define bh_lru_unlock() preempt_enable()
1325 #endif
1326
1327 static inline void check_irqs_on(void)
1328 {
1329 #ifdef irqs_disabled
1330         BUG_ON(irqs_disabled());
1331 #endif
1332 }
1333
1334 /*
1335  * The LRU management algorithm is dopey-but-simple.  Sorry.
1336  */
1337 static void bh_lru_install(struct buffer_head *bh)
1338 {
1339         struct buffer_head *evictee = NULL;
1340         struct bh_lru *lru;
1341
1342         check_irqs_on();
1343         bh_lru_lock();
1344         lru = &__get_cpu_var(bh_lrus);
1345         if (lru->bhs[0] != bh) {
1346                 struct buffer_head *bhs[BH_LRU_SIZE];
1347                 int in;
1348                 int out = 0;
1349
1350                 get_bh(bh);
1351                 bhs[out++] = bh;
1352                 for (in = 0; in < BH_LRU_SIZE; in++) {
1353                         struct buffer_head *bh2 = lru->bhs[in];
1354
1355                         if (bh2 == bh) {
1356                                 __brelse(bh2);
1357                         } else {
1358                                 if (out >= BH_LRU_SIZE) {
1359                                         BUG_ON(evictee != NULL);
1360                                         evictee = bh2;
1361                                 } else {
1362                                         bhs[out++] = bh2;
1363                                 }
1364                         }
1365                 }
1366                 while (out < BH_LRU_SIZE)
1367                         bhs[out++] = NULL;
1368                 memcpy(lru->bhs, bhs, sizeof(bhs));
1369         }
1370         bh_lru_unlock();
1371
1372         if (evictee)
1373                 __brelse(evictee);
1374 }
1375
1376 /*
1377  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1378  */
1379 static inline struct buffer_head *
1380 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1381 {
1382         struct buffer_head *ret = NULL;
1383         struct bh_lru *lru;
1384         int i;
1385
1386         check_irqs_on();
1387         bh_lru_lock();
1388         lru = &__get_cpu_var(bh_lrus);
1389         for (i = 0; i < BH_LRU_SIZE; i++) {
1390                 struct buffer_head *bh = lru->bhs[i];
1391
1392                 if (bh && bh->b_bdev == bdev &&
1393                                 bh->b_blocknr == block && bh->b_size == size) {
1394                         if (i) {
1395                                 while (i) {
1396                                         lru->bhs[i] = lru->bhs[i - 1];
1397                                         i--;
1398                                 }
1399                                 lru->bhs[0] = bh;
1400                         }
1401                         get_bh(bh);
1402                         ret = bh;
1403                         break;
1404                 }
1405         }
1406         bh_lru_unlock();
1407         return ret;
1408 }
1409
1410 /*
1411  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1412  * it in the LRU and mark it as accessed.  If it is not present then return
1413  * NULL
1414  */
1415 struct buffer_head *
1416 __find_get_block(struct block_device *bdev, sector_t block, int size)
1417 {
1418         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1419
1420         if (bh == NULL) {
1421                 bh = __find_get_block_slow(bdev, block, size);
1422                 if (bh)
1423                         bh_lru_install(bh);
1424         }
1425         if (bh)
1426                 touch_buffer(bh);
1427         return bh;
1428 }
1429 EXPORT_SYMBOL(__find_get_block);
1430
1431 /*
1432  * __getblk will locate (and, if necessary, create) the buffer_head
1433  * which corresponds to the passed block_device, block and size. The
1434  * returned buffer has its reference count incremented.
1435  *
1436  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1437  * illegal block number, __getblk() will happily return a buffer_head
1438  * which represents the non-existent block.  Very weird.
1439  *
1440  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1441  * attempt is failing.  FIXME, perhaps?
1442  */
1443 struct buffer_head *
1444 __getblk(struct block_device *bdev, sector_t block, int size)
1445 {
1446         struct buffer_head *bh = __find_get_block(bdev, block, size);
1447
1448         if (bh == NULL)
1449                 bh = __getblk_slow(bdev, block, size);
1450         return bh;
1451 }
1452 EXPORT_SYMBOL(__getblk);
1453
1454 /*
1455  * Do async read-ahead on a buffer..
1456  */
1457 void __breadahead(struct block_device *bdev, sector_t block, int size)
1458 {
1459         struct buffer_head *bh = __getblk(bdev, block, size);
1460         ll_rw_block(READA, 1, &bh);
1461         brelse(bh);
1462 }
1463 EXPORT_SYMBOL(__breadahead);
1464
1465 /**
1466  *  __bread() - reads a specified block and returns the bh
1467  *  @block: number of block
1468  *  @size: size (in bytes) to read
1469  *
1470  *  Reads a specified block, and returns buffer head that contains it.
1471  *  It returns NULL if the block was unreadable.
1472  */
1473 struct buffer_head *
1474 __bread(struct block_device *bdev, sector_t block, int size)
1475 {
1476         struct buffer_head *bh = __getblk(bdev, block, size);
1477
1478         if (!buffer_uptodate(bh))
1479                 bh = __bread_slow(bh);
1480         return bh;
1481 }
1482 EXPORT_SYMBOL(__bread);
1483
1484 /*
1485  * invalidate_bh_lrus() is called rarely - at unmount.  Because it is only for
1486  * unmount it only needs to ensure that all buffers from the target device are
1487  * invalidated on return and it doesn't need to worry about new buffers from
1488  * that device being added - the unmount code has to prevent that.
1489  */
1490 static void invalidate_bh_lru(void *arg)
1491 {
1492         struct bh_lru *b = &get_cpu_var(bh_lrus);
1493         int i;
1494
1495         for (i = 0; i < BH_LRU_SIZE; i++) {
1496                 brelse(b->bhs[i]);
1497                 b->bhs[i] = NULL;
1498         }
1499         put_cpu_var(bh_lrus);
1500 }
1501
1502 static void invalidate_bh_lrus(void)
1503 {
1504         on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1505 }
1506
1507 void set_bh_page(struct buffer_head *bh,
1508                 struct page *page, unsigned long offset)
1509 {
1510         bh->b_page = page;
1511         if (offset >= PAGE_SIZE)
1512                 BUG();
1513         if (PageHighMem(page))
1514                 /*
1515                  * This catches illegal uses and preserves the offset:
1516                  */
1517                 bh->b_data = (char *)(0 + offset);
1518         else
1519                 bh->b_data = page_address(page) + offset;
1520 }
1521 EXPORT_SYMBOL(set_bh_page);
1522
1523 /*
1524  * Called when truncating a buffer on a page completely.
1525  */
1526 static inline void discard_buffer(struct buffer_head * bh)
1527 {
1528         lock_buffer(bh);
1529         clear_buffer_dirty(bh);
1530         bh->b_bdev = NULL;
1531         clear_buffer_mapped(bh);
1532         clear_buffer_req(bh);
1533         clear_buffer_new(bh);
1534         clear_buffer_delay(bh);
1535         unlock_buffer(bh);
1536 }
1537
1538 /**
1539  * try_to_release_page() - release old fs-specific metadata on a page
1540  *
1541  * @page: the page which the kernel is trying to free
1542  * @gfp_mask: memory allocation flags (and I/O mode)
1543  *
1544  * The address_space is to try to release any data against the page
1545  * (presumably at page->private).  If the release was successful, return `1'.
1546  * Otherwise return zero.
1547  *
1548  * The @gfp_mask argument specifies whether I/O may be performed to release
1549  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1550  *
1551  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1552  */
1553 int try_to_release_page(struct page *page, int gfp_mask)
1554 {
1555         struct address_space * const mapping = page->mapping;
1556
1557         if (!PageLocked(page))
1558                 BUG();
1559         if (PageWriteback(page))
1560                 return 0;
1561
1562         if (mapping && mapping->a_ops->releasepage)
1563                 return mapping->a_ops->releasepage(page, gfp_mask);
1564         return try_to_free_buffers(page);
1565 }
1566
1567 /**
1568  * block_invalidatepage - invalidate part of all of a buffer-backed page
1569  *
1570  * @page: the page which is affected
1571  * @offset: the index of the truncation point
1572  *
1573  * block_invalidatepage() is called when all or part of the page has become
1574  * invalidatedby a truncate operation.
1575  *
1576  * block_invalidatepage() does not have to release all buffers, but it must
1577  * ensure that no dirty buffer is left outside @offset and that no I/O
1578  * is underway against any of the blocks which are outside the truncation
1579  * point.  Because the caller is about to free (and possibly reuse) those
1580  * blocks on-disk.
1581  */
1582 int block_invalidatepage(struct page *page, unsigned long offset)
1583 {
1584         struct buffer_head *head, *bh, *next;
1585         unsigned int curr_off = 0;
1586         int ret = 1;
1587
1588         BUG_ON(!PageLocked(page));
1589         if (!page_has_buffers(page))
1590                 goto out;
1591
1592         head = page_buffers(page);
1593         bh = head;
1594         do {
1595                 unsigned int next_off = curr_off + bh->b_size;
1596                 next = bh->b_this_page;
1597
1598                 /*
1599                  * is this block fully invalidated?
1600                  */
1601                 if (offset <= curr_off)
1602                         discard_buffer(bh);
1603                 curr_off = next_off;
1604                 bh = next;
1605         } while (bh != head);
1606
1607         /*
1608          * We release buffers only if the entire page is being invalidated.
1609          * The get_block cached value has been unconditionally invalidated,
1610          * so real IO is not possible anymore.
1611          */
1612         if (offset == 0)
1613                 ret = try_to_release_page(page, 0);
1614 out:
1615         return ret;
1616 }
1617 EXPORT_SYMBOL(block_invalidatepage);
1618
1619 /*
1620  * We attach and possibly dirty the buffers atomically wrt
1621  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1622  * is already excluded via the page lock.
1623  */
1624 void create_empty_buffers(struct page *page,
1625                         unsigned long blocksize, unsigned long b_state)
1626 {
1627         struct buffer_head *bh, *head, *tail;
1628
1629         head = create_buffers(page, blocksize, 1);
1630         bh = head;
1631         do {
1632                 bh->b_state |= b_state;
1633                 tail = bh;
1634                 bh = bh->b_this_page;
1635         } while (bh);
1636         tail->b_this_page = head;
1637
1638         spin_lock(&page->mapping->private_lock);
1639         if (PageUptodate(page) || PageDirty(page)) {
1640                 bh = head;
1641                 do {
1642                         if (PageDirty(page))
1643                                 set_buffer_dirty(bh);
1644                         if (PageUptodate(page))
1645                                 set_buffer_uptodate(bh);
1646                         bh = bh->b_this_page;
1647                 } while (bh != head);
1648         }
1649         __set_page_buffers(page, head);
1650         spin_unlock(&page->mapping->private_lock);
1651 }
1652 EXPORT_SYMBOL(create_empty_buffers);
1653
1654 /*
1655  * We are taking a block for data and we don't want any output from any
1656  * buffer-cache aliases starting from return from that function and
1657  * until the moment when something will explicitly mark the buffer
1658  * dirty (hopefully that will not happen until we will free that block ;-)
1659  * We don't even need to mark it not-uptodate - nobody can expect
1660  * anything from a newly allocated buffer anyway. We used to used
1661  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1662  * don't want to mark the alias unmapped, for example - it would confuse
1663  * anyone who might pick it with bread() afterwards...
1664  *
1665  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1666  * be writeout I/O going on against recently-freed buffers.  We don't
1667  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1668  * only if we really need to.  That happens here.
1669  */
1670 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1671 {
1672         struct buffer_head *old_bh;
1673
1674         old_bh = __find_get_block_slow(bdev, block, 0);
1675         if (old_bh) {
1676 #if 0   /* This happens.  Later. */
1677                 if (buffer_dirty(old_bh))
1678                         buffer_error();
1679 #endif
1680                 clear_buffer_dirty(old_bh);
1681                 wait_on_buffer(old_bh);
1682                 clear_buffer_req(old_bh);
1683                 __brelse(old_bh);
1684         }
1685 }
1686 EXPORT_SYMBOL(unmap_underlying_metadata);
1687
1688 /*
1689  * NOTE! All mapped/uptodate combinations are valid:
1690  *
1691  *      Mapped  Uptodate        Meaning
1692  *
1693  *      No      No              "unknown" - must do get_block()
1694  *      No      Yes             "hole" - zero-filled
1695  *      Yes     No              "allocated" - allocated on disk, not read in
1696  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1697  *
1698  * "Dirty" is valid only with the last case (mapped+uptodate).
1699  */
1700
1701 /*
1702  * While block_write_full_page is writing back the dirty buffers under
1703  * the page lock, whoever dirtied the buffers may decide to clean them
1704  * again at any time.  We handle that by only looking at the buffer
1705  * state inside lock_buffer().
1706  *
1707  * If block_write_full_page() is called for regular writeback
1708  * (called_for_sync() is false) then it will redirty a page which has a locked
1709  * buffer.   This only can happen if someone has written the buffer directly,
1710  * with submit_bh().  At the address_space level PageWriteback prevents this
1711  * contention from occurring.
1712  */
1713 static int __block_write_full_page(struct inode *inode, struct page *page,
1714                         get_block_t *get_block, struct writeback_control *wbc)
1715 {
1716         int err;
1717         unsigned long block;
1718         unsigned long last_block;
1719         struct buffer_head *bh, *head;
1720         int nr_underway = 0;
1721
1722         BUG_ON(!PageLocked(page));
1723
1724         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1725
1726         if (!page_has_buffers(page)) {
1727                 if (!PageUptodate(page))
1728                         buffer_error();
1729                 create_empty_buffers(page, 1 << inode->i_blkbits,
1730                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1731         }
1732
1733         /*
1734          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1735          * here, and the (potentially unmapped) buffers may become dirty at
1736          * any time.  If a buffer becomes dirty here after we've inspected it
1737          * then we just miss that fact, and the page stays dirty.
1738          *
1739          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1740          * handle that here by just cleaning them.
1741          */
1742
1743         block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1744         head = page_buffers(page);
1745         bh = head;
1746
1747         /*
1748          * Get all the dirty buffers mapped to disk addresses and
1749          * handle any aliases from the underlying blockdev's mapping.
1750          */
1751         do {
1752                 if (block > last_block) {
1753                         /*
1754                          * mapped buffers outside i_size will occur, because
1755                          * this page can be outside i_size when there is a
1756                          * truncate in progress.
1757                          *
1758                          * if (buffer_mapped(bh))
1759                          *      buffer_error();
1760                          */
1761                         /*
1762                          * The buffer was zeroed by block_write_full_page()
1763                          */
1764                         clear_buffer_dirty(bh);
1765                         set_buffer_uptodate(bh);
1766                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1767                         if (buffer_new(bh))
1768                                 buffer_error();
1769                         err = get_block(inode, block, bh, 1);
1770                         if (err)
1771                                 goto recover;
1772                         if (buffer_new(bh)) {
1773                                 /* blockdev mappings never come here */
1774                                 clear_buffer_new(bh);
1775                                 unmap_underlying_metadata(bh->b_bdev,
1776                                                         bh->b_blocknr);
1777                         }
1778                 }
1779                 bh = bh->b_this_page;
1780                 block++;
1781         } while (bh != head);
1782
1783         do {
1784                 get_bh(bh);
1785                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1786                         if (wbc->sync_mode != WB_SYNC_NONE) {
1787                                 lock_buffer(bh);
1788                         } else {
1789                                 if (test_set_buffer_locked(bh)) {
1790                                         __set_page_dirty_nobuffers(page);
1791                                         continue;
1792                                 }
1793                         }
1794                         if (test_clear_buffer_dirty(bh)) {
1795                                 if (!buffer_uptodate(bh))
1796                                         buffer_error();
1797                                 mark_buffer_async_write(bh);
1798                         } else {
1799                                 unlock_buffer(bh);
1800                         }
1801                 }
1802         } while ((bh = bh->b_this_page) != head);
1803
1804         BUG_ON(PageWriteback(page));
1805         SetPageWriteback(page);         /* Keeps try_to_free_buffers() away */
1806         unlock_page(page);
1807
1808         /*
1809          * The page may come unlocked any time after the *first* submit_bh()
1810          * call.  Be careful with its buffers.
1811          */
1812         do {
1813                 struct buffer_head *next = bh->b_this_page;
1814                 if (buffer_async_write(bh)) {
1815                         submit_bh(WRITE, bh);
1816                         nr_underway++;
1817                 }
1818                 put_bh(bh);
1819                 bh = next;
1820         } while (bh != head);
1821
1822         err = 0;
1823 done:
1824         if (nr_underway == 0) {
1825                 /*
1826                  * The page was marked dirty, but the buffers were
1827                  * clean.  Someone wrote them back by hand with
1828                  * ll_rw_block/submit_bh.  A rare case.
1829                  */
1830                 int uptodate = 1;
1831                 do {
1832                         if (!buffer_uptodate(bh)) {
1833                                 uptodate = 0;
1834                                 break;
1835                         }
1836                         bh = bh->b_this_page;
1837                 } while (bh != head);
1838                 if (uptodate)
1839                         SetPageUptodate(page);
1840                 end_page_writeback(page);
1841         }
1842         return err;
1843
1844 recover:
1845         /*
1846          * ENOSPC, or some other error.  We may already have added some
1847          * blocks to the file, so we need to write these out to avoid
1848          * exposing stale data.
1849          * The page is currently locked and not marked for writeback
1850          */
1851         bh = head;
1852         /* Recovery: lock and submit the mapped buffers */
1853         do {
1854                 get_bh(bh);
1855                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1856                         lock_buffer(bh);
1857                         mark_buffer_async_write(bh);
1858                 } else {
1859                         /*
1860                          * The buffer may have been set dirty during
1861                          * attachment to a dirty page.
1862                          */
1863                         clear_buffer_dirty(bh);
1864                 }
1865         } while ((bh = bh->b_this_page) != head);
1866         SetPageError(page);
1867         BUG_ON(PageWriteback(page));
1868         SetPageWriteback(page);
1869         unlock_page(page);
1870         do {
1871                 struct buffer_head *next = bh->b_this_page;
1872                 if (buffer_async_write(bh)) {
1873                         clear_buffer_dirty(bh);
1874                         submit_bh(WRITE, bh);
1875                         nr_underway++;
1876                 }
1877                 put_bh(bh);
1878                 bh = next;
1879         } while (bh != head);
1880         goto done;
1881 }
1882
1883 static int __block_prepare_write(struct inode *inode, struct page *page,
1884                 unsigned from, unsigned to, get_block_t *get_block)
1885 {
1886         unsigned block_start, block_end;
1887         sector_t block;
1888         int err = 0;
1889         unsigned blocksize, bbits;
1890         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1891
1892         BUG_ON(!PageLocked(page));
1893         BUG_ON(from > PAGE_CACHE_SIZE);
1894         BUG_ON(to > PAGE_CACHE_SIZE);
1895         BUG_ON(from > to);
1896
1897         blocksize = 1 << inode->i_blkbits;
1898         if (!page_has_buffers(page))
1899                 create_empty_buffers(page, blocksize, 0);
1900         head = page_buffers(page);
1901
1902         bbits = inode->i_blkbits;
1903         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1904
1905         for(bh = head, block_start = 0; bh != head || !block_start;
1906             block++, block_start=block_end, bh = bh->b_this_page) {
1907                 block_end = block_start + blocksize;
1908                 if (block_end <= from || block_start >= to) {
1909                         if (PageUptodate(page)) {
1910                                 if (!buffer_uptodate(bh))
1911                                         set_buffer_uptodate(bh);
1912                         }
1913                         continue;
1914                 }
1915                 if (buffer_new(bh))
1916                         clear_buffer_new(bh);
1917                 if (!buffer_mapped(bh)) {
1918                         err = get_block(inode, block, bh, 1);
1919                         if (err)
1920                                 goto out;
1921                         if (buffer_new(bh)) {
1922                                 clear_buffer_new(bh);
1923                                 unmap_underlying_metadata(bh->b_bdev,
1924                                                         bh->b_blocknr);
1925                                 if (PageUptodate(page)) {
1926                                         if (!buffer_mapped(bh))
1927                                                 buffer_error();
1928                                         set_buffer_uptodate(bh);
1929                                         continue;
1930                                 }
1931                                 if (block_end > to || block_start < from) {
1932                                         void *kaddr;
1933
1934                                         kaddr = kmap_atomic(page, KM_USER0);
1935                                         if (block_end > to)
1936                                                 memset(kaddr+to, 0,
1937                                                         block_end-to);
1938                                         if (block_start < from)
1939                                                 memset(kaddr+block_start,
1940                                                         0, from-block_start);
1941                                         flush_dcache_page(page);
1942                                         kunmap_atomic(kaddr, KM_USER0);
1943                                 }
1944                                 continue;
1945                         }
1946                 }
1947                 if (PageUptodate(page)) {
1948                         if (!buffer_uptodate(bh))
1949                                 set_buffer_uptodate(bh);
1950                         continue;
1951                 }
1952                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1953                      (block_start < from || block_end > to)) {
1954                         ll_rw_block(READ, 1, &bh);
1955                         *wait_bh++=bh;
1956                 }
1957         }
1958         /*
1959          * If we issued read requests - let them complete.
1960          */
1961         while(wait_bh > wait) {
1962                 wait_on_buffer(*--wait_bh);
1963                 if (!buffer_uptodate(*wait_bh))
1964                         return -EIO;
1965         }
1966         return 0;
1967 out:
1968         /*
1969          * Zero out any newly allocated blocks to avoid exposing stale
1970          * data.  If BH_New is set, we know that the block was newly
1971          * allocated in the above loop.
1972          */
1973         bh = head;
1974         block_start = 0;
1975         do {
1976                 block_end = block_start+blocksize;
1977                 if (block_end <= from)
1978                         goto next_bh;
1979                 if (block_start >= to)
1980                         break;
1981                 if (buffer_new(bh)) {
1982                         void *kaddr;
1983
1984                         clear_buffer_new(bh);
1985                         if (buffer_uptodate(bh))
1986                                 buffer_error();
1987                         kaddr = kmap_atomic(page, KM_USER0);
1988                         memset(kaddr+block_start, 0, bh->b_size);
1989                         kunmap_atomic(kaddr, KM_USER0);
1990                         set_buffer_uptodate(bh);
1991                         mark_buffer_dirty(bh);
1992                 }
1993 next_bh:
1994                 block_start = block_end;
1995                 bh = bh->b_this_page;
1996         } while (bh != head);
1997         return err;
1998 }
1999
2000 static int __block_commit_write(struct inode *inode, struct page *page,
2001                 unsigned from, unsigned to)
2002 {
2003         unsigned block_start, block_end;
2004         int partial = 0;
2005         unsigned blocksize;
2006         struct buffer_head *bh, *head;
2007
2008         blocksize = 1 << inode->i_blkbits;
2009
2010         for(bh = head = page_buffers(page), block_start = 0;
2011             bh != head || !block_start;
2012             block_start=block_end, bh = bh->b_this_page) {
2013                 block_end = block_start + blocksize;
2014                 if (block_end <= from || block_start >= to) {
2015                         if (!buffer_uptodate(bh))
2016                                 partial = 1;
2017                 } else {
2018                         set_buffer_uptodate(bh);
2019                         mark_buffer_dirty(bh);
2020                 }
2021         }
2022
2023         /*
2024          * If this is a partial write which happened to make all buffers
2025          * uptodate then we can optimize away a bogus readpage() for
2026          * the next read(). Here we 'discover' whether the page went
2027          * uptodate as a result of this (potentially partial) write.
2028          */
2029         if (!partial)
2030                 SetPageUptodate(page);
2031         return 0;
2032 }
2033
2034 /*
2035  * Generic "read page" function for block devices that have the normal
2036  * get_block functionality. This is most of the block device filesystems.
2037  * Reads the page asynchronously --- the unlock_buffer() and
2038  * set/clear_buffer_uptodate() functions propagate buffer state into the
2039  * page struct once IO has completed.
2040  */
2041 int block_read_full_page(struct page *page, get_block_t *get_block)
2042 {
2043         struct inode *inode = page->mapping->host;
2044         sector_t iblock, lblock;
2045         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2046         unsigned int blocksize;
2047         int nr, i;
2048         int fully_mapped = 1;
2049
2050         if (!PageLocked(page))
2051                 PAGE_BUG(page);
2052         if (PageUptodate(page))
2053                 buffer_error();
2054         blocksize = 1 << inode->i_blkbits;
2055         if (!page_has_buffers(page))
2056                 create_empty_buffers(page, blocksize, 0);
2057         head = page_buffers(page);
2058
2059         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2060         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2061         bh = head;
2062         nr = 0;
2063         i = 0;
2064
2065         do {
2066                 if (buffer_uptodate(bh))
2067                         continue;
2068
2069                 if (!buffer_mapped(bh)) {
2070                         fully_mapped = 0;
2071                         if (iblock < lblock) {
2072                                 if (get_block(inode, iblock, bh, 0))
2073                                         SetPageError(page);
2074                         }
2075                         if (!buffer_mapped(bh)) {
2076                                 void *kaddr = kmap_atomic(page, KM_USER0);
2077                                 memset(kaddr + i * blocksize, 0, blocksize);
2078                                 flush_dcache_page(page);
2079                                 kunmap_atomic(kaddr, KM_USER0);
2080                                 set_buffer_uptodate(bh);
2081                                 continue;
2082                         }
2083                         /*
2084                          * get_block() might have updated the buffer
2085                          * synchronously
2086                          */
2087                         if (buffer_uptodate(bh))
2088                                 continue;
2089                 }
2090                 arr[nr++] = bh;
2091         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2092
2093         if (fully_mapped)
2094                 SetPageMappedToDisk(page);
2095
2096         if (!nr) {
2097                 /*
2098                  * All buffers are uptodate - we can set the page uptodate
2099                  * as well. But not if get_block() returned an error.
2100                  */
2101                 if (!PageError(page))
2102                         SetPageUptodate(page);
2103                 unlock_page(page);
2104                 return 0;
2105         }
2106
2107         /* Stage two: lock the buffers */
2108         for (i = 0; i < nr; i++) {
2109                 bh = arr[i];
2110                 lock_buffer(bh);
2111                 mark_buffer_async_read(bh);
2112         }
2113
2114         /*
2115          * Stage 3: start the IO.  Check for uptodateness
2116          * inside the buffer lock in case another process reading
2117          * the underlying blockdev brought it uptodate (the sct fix).
2118          */
2119         for (i = 0; i < nr; i++) {
2120                 bh = arr[i];
2121                 if (buffer_uptodate(bh))
2122                         end_buffer_async_read(bh, 1);
2123                 else
2124                         submit_bh(READ, bh);
2125         }
2126         return 0;
2127 }
2128
2129 /* utility function for filesystems that need to do work on expanding
2130  * truncates.  Uses prepare/commit_write to allow the filesystem to
2131  * deal with the hole.
2132  */
2133 int generic_cont_expand(struct inode *inode, loff_t size)
2134 {
2135         struct address_space *mapping = inode->i_mapping;
2136         struct page *page;
2137         unsigned long index, offset, limit;
2138         int err;
2139
2140         err = -EFBIG;
2141         limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2142         if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2143                 send_sig(SIGXFSZ, current, 0);
2144                 goto out;
2145         }
2146         if (size > inode->i_sb->s_maxbytes)
2147                 goto out;
2148
2149         offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2150
2151         /* ugh.  in prepare/commit_write, if from==to==start of block, we
2152         ** skip the prepare.  make sure we never send an offset for the start
2153         ** of a block
2154         */
2155         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2156                 offset++;
2157         }
2158         index = size >> PAGE_CACHE_SHIFT;
2159         err = -ENOMEM;
2160         page = grab_cache_page(mapping, index);
2161         if (!page)
2162                 goto out;
2163         err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2164         if (!err) {
2165                 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2166         }
2167         unlock_page(page);
2168         page_cache_release(page);
2169         if (err > 0)
2170                 err = 0;
2171 out:
2172         return err;
2173 }
2174
2175 /*
2176  * For moronic filesystems that do not allow holes in file.
2177  * We may have to extend the file.
2178  */
2179
2180 int cont_prepare_write(struct page *page, unsigned offset,
2181                 unsigned to, get_block_t *get_block, loff_t *bytes)
2182 {
2183         struct address_space *mapping = page->mapping;
2184         struct inode *inode = mapping->host;
2185         struct page *new_page;
2186         unsigned long pgpos;
2187         long status;
2188         unsigned zerofrom;
2189         unsigned blocksize = 1 << inode->i_blkbits;
2190         void *kaddr;
2191
2192         while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2193                 status = -ENOMEM;
2194                 new_page = grab_cache_page(mapping, pgpos);
2195                 if (!new_page)
2196                         goto out;
2197                 /* we might sleep */
2198                 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2199                         unlock_page(new_page);
2200                         page_cache_release(new_page);
2201                         continue;
2202                 }
2203                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2204                 if (zerofrom & (blocksize-1)) {
2205                         *bytes |= (blocksize-1);
2206                         (*bytes)++;
2207                 }
2208                 status = __block_prepare_write(inode, new_page, zerofrom,
2209                                                 PAGE_CACHE_SIZE, get_block);
2210                 if (status)
2211                         goto out_unmap;
2212                 kaddr = kmap_atomic(new_page, KM_USER0);
2213                 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2214                 flush_dcache_page(new_page);
2215                 kunmap_atomic(kaddr, KM_USER0);
2216                 __block_commit_write(inode, new_page,
2217                                 zerofrom, PAGE_CACHE_SIZE);
2218                 unlock_page(new_page);
2219                 page_cache_release(new_page);
2220         }
2221
2222         if (page->index < pgpos) {
2223                 /* completely inside the area */
2224                 zerofrom = offset;
2225         } else {
2226                 /* page covers the boundary, find the boundary offset */
2227                 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2228
2229                 /* if we will expand the thing last block will be filled */
2230                 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2231                         *bytes |= (blocksize-1);
2232                         (*bytes)++;
2233                 }
2234
2235                 /* starting below the boundary? Nothing to zero out */
2236                 if (offset <= zerofrom)
2237                         zerofrom = offset;
2238         }
2239         status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2240         if (status)
2241                 goto out1;
2242         if (zerofrom < offset) {
2243                 kaddr = kmap_atomic(page, KM_USER0);
2244                 memset(kaddr+zerofrom, 0, offset-zerofrom);
2245                 flush_dcache_page(page);
2246                 kunmap_atomic(kaddr, KM_USER0);
2247                 __block_commit_write(inode, page, zerofrom, offset);
2248         }
2249         return 0;
2250 out1:
2251         ClearPageUptodate(page);
2252         return status;
2253
2254 out_unmap:
2255         ClearPageUptodate(new_page);
2256         unlock_page(new_page);
2257         page_cache_release(new_page);
2258 out:
2259         return status;
2260 }
2261
2262 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2263                         get_block_t *get_block)
2264 {
2265         struct inode *inode = page->mapping->host;
2266         int err = __block_prepare_write(inode, page, from, to, get_block);
2267         if (err)
2268                 ClearPageUptodate(page);
2269         return err;
2270 }
2271
2272 int block_commit_write(struct page *page, unsigned from, unsigned to)
2273 {
2274         struct inode *inode = page->mapping->host;
2275         __block_commit_write(inode,page,from,to);
2276         return 0;
2277 }
2278
2279 int generic_commit_write(struct file *file, struct page *page,
2280                 unsigned from, unsigned to)
2281 {
2282         struct inode *inode = page->mapping->host;
2283         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2284         __block_commit_write(inode,page,from,to);
2285         /*
2286          * No need to use i_size_read() here, the i_size
2287          * cannot change under us because we hold i_sem.
2288          */
2289         if (pos > inode->i_size) {
2290                 i_size_write(inode, pos);
2291                 mark_inode_dirty(inode);
2292         }
2293         return 0;
2294 }
2295
2296 /*
2297  * On entry, the page is fully not uptodate.
2298  * On exit the page is fully uptodate in the areas outside (from,to)
2299  */
2300 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2301                         get_block_t *get_block)
2302 {
2303         struct inode *inode = page->mapping->host;
2304         const unsigned blkbits = inode->i_blkbits;
2305         const unsigned blocksize = 1 << blkbits;
2306         struct buffer_head map_bh;
2307         struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2308         unsigned block_in_page;
2309         unsigned block_start;
2310         sector_t block_in_file;
2311         char *kaddr;
2312         int nr_reads = 0;
2313         int i;
2314         int ret = 0;
2315         int is_mapped_to_disk = 1;
2316         int dirtied_it = 0;
2317
2318         if (PageMappedToDisk(page))
2319                 return 0;
2320
2321         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2322         map_bh.b_page = page;
2323
2324         /*
2325          * We loop across all blocks in the page, whether or not they are
2326          * part of the affected region.  This is so we can discover if the
2327          * page is fully mapped-to-disk.
2328          */
2329         for (block_start = 0, block_in_page = 0;
2330                   block_start < PAGE_CACHE_SIZE;
2331                   block_in_page++, block_start += blocksize) {
2332                 unsigned block_end = block_start + blocksize;
2333                 int create;
2334
2335                 map_bh.b_state = 0;
2336                 create = 1;
2337                 if (block_start >= to)
2338                         create = 0;
2339                 ret = get_block(inode, block_in_file + block_in_page,
2340                                         &map_bh, create);
2341                 if (ret)
2342                         goto failed;
2343                 if (!buffer_mapped(&map_bh))
2344                         is_mapped_to_disk = 0;
2345                 if (buffer_new(&map_bh))
2346                         unmap_underlying_metadata(map_bh.b_bdev,
2347                                                         map_bh.b_blocknr);
2348                 if (PageUptodate(page))
2349                         continue;
2350                 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2351                         kaddr = kmap_atomic(page, KM_USER0);
2352                         if (block_start < from) {
2353                                 memset(kaddr+block_start, 0, from-block_start);
2354                                 dirtied_it = 1;
2355                         }
2356                         if (block_end > to) {
2357                                 memset(kaddr + to, 0, block_end - to);
2358                                 dirtied_it = 1;
2359                         }
2360                         flush_dcache_page(page);
2361                         kunmap_atomic(kaddr, KM_USER0);
2362                         continue;
2363                 }
2364                 if (buffer_uptodate(&map_bh))
2365                         continue;       /* reiserfs does this */
2366                 if (block_start < from || block_end > to) {
2367                         struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2368
2369                         if (!bh) {
2370                                 ret = -ENOMEM;
2371                                 goto failed;
2372                         }
2373                         bh->b_state = map_bh.b_state;
2374                         atomic_set(&bh->b_count, 0);
2375                         bh->b_this_page = 0;
2376                         bh->b_page = page;
2377                         bh->b_blocknr = map_bh.b_blocknr;
2378                         bh->b_size = blocksize;
2379                         bh->b_data = (char *)(long)block_start;
2380                         bh->b_bdev = map_bh.b_bdev;
2381                         bh->b_private = NULL;
2382                         read_bh[nr_reads++] = bh;
2383                 }
2384         }
2385
2386         if (nr_reads) {
2387                 ll_rw_block(READ, nr_reads, read_bh);
2388                 for (i = 0; i < nr_reads; i++) {
2389                         wait_on_buffer(read_bh[i]);
2390                         if (!buffer_uptodate(read_bh[i]))
2391                                 ret = -EIO;
2392                         free_buffer_head(read_bh[i]);
2393                         read_bh[i] = NULL;
2394                 }
2395                 if (ret)
2396                         goto failed;
2397         }
2398
2399         if (is_mapped_to_disk)
2400                 SetPageMappedToDisk(page);
2401         SetPageUptodate(page);
2402
2403         /*
2404          * Setting the page dirty here isn't necessary for the prepare_write
2405          * function - commit_write will do that.  But if/when this function is
2406          * used within the pagefault handler to ensure that all mmapped pages
2407          * have backing space in the filesystem, we will need to dirty the page
2408          * if its contents were altered.
2409          */
2410         if (dirtied_it)
2411                 set_page_dirty(page);
2412
2413         return 0;
2414
2415 failed:
2416         for (i = 0; i < nr_reads; i++) {
2417                 if (read_bh[i])
2418                         free_buffer_head(read_bh[i]);
2419         }
2420
2421         /*
2422          * Error recovery is pretty slack.  Clear the page and mark it dirty
2423          * so we'll later zero out any blocks which _were_ allocated.
2424          */
2425         kaddr = kmap_atomic(page, KM_USER0);
2426         memset(kaddr, 0, PAGE_CACHE_SIZE);
2427         kunmap_atomic(kaddr, KM_USER0);
2428         SetPageUptodate(page);
2429         set_page_dirty(page);
2430         return ret;
2431 }
2432 EXPORT_SYMBOL(nobh_prepare_write);
2433
2434 int nobh_commit_write(struct file *file, struct page *page,
2435                 unsigned from, unsigned to)
2436 {
2437         struct inode *inode = page->mapping->host;
2438         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2439
2440         set_page_dirty(page);
2441         if (pos > inode->i_size) {
2442                 i_size_write(inode, pos);
2443                 mark_inode_dirty(inode);
2444         }
2445         return 0;
2446 }
2447 EXPORT_SYMBOL(nobh_commit_write);
2448
2449 /*
2450  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2451  */
2452 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2453 {
2454         struct inode *inode = mapping->host;
2455         unsigned blocksize = 1 << inode->i_blkbits;
2456         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2457         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2458         unsigned to;
2459         struct page *page;
2460         struct address_space_operations *a_ops = mapping->a_ops;
2461         char *kaddr;
2462         int ret = 0;
2463
2464         if ((offset & (blocksize - 1)) == 0)
2465                 goto out;
2466
2467         ret = -ENOMEM;
2468         page = grab_cache_page(mapping, index);
2469         if (!page)
2470                 goto out;
2471
2472         to = (offset + blocksize) & ~(blocksize - 1);
2473         ret = a_ops->prepare_write(NULL, page, offset, to);
2474         if (ret == 0) {
2475                 kaddr = kmap_atomic(page, KM_USER0);
2476                 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2477                 flush_dcache_page(page);
2478                 kunmap_atomic(kaddr, KM_USER0);
2479                 set_page_dirty(page);
2480         }
2481         unlock_page(page);
2482         page_cache_release(page);
2483 out:
2484         return ret;
2485 }
2486 EXPORT_SYMBOL(nobh_truncate_page);
2487
2488 int block_truncate_page(struct address_space *mapping,
2489                         loff_t from, get_block_t *get_block)
2490 {
2491         unsigned long index = from >> PAGE_CACHE_SHIFT;
2492         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2493         unsigned blocksize, iblock, length, pos;
2494         struct inode *inode = mapping->host;
2495         struct page *page;
2496         struct buffer_head *bh;
2497         void *kaddr;
2498         int err;
2499
2500         blocksize = 1 << inode->i_blkbits;
2501         length = offset & (blocksize - 1);
2502
2503         /* Block boundary? Nothing to do */
2504         if (!length)
2505                 return 0;
2506
2507         length = blocksize - length;
2508         iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2509
2510         page = grab_cache_page(mapping, index);
2511         err = -ENOMEM;
2512         if (!page)
2513                 goto out;
2514
2515         if (!page_has_buffers(page))
2516                 create_empty_buffers(page, blocksize, 0);
2517
2518         /* Find the buffer that contains "offset" */
2519         bh = page_buffers(page);
2520         pos = blocksize;
2521         while (offset >= pos) {
2522                 bh = bh->b_this_page;
2523                 iblock++;
2524                 pos += blocksize;
2525         }
2526
2527         err = 0;
2528         if (!buffer_mapped(bh)) {
2529                 err = get_block(inode, iblock, bh, 0);
2530                 if (err)
2531                         goto unlock;
2532                 /* unmapped? It's a hole - nothing to do */
2533                 if (!buffer_mapped(bh))
2534                         goto unlock;
2535         }
2536
2537         /* Ok, it's mapped. Make sure it's up-to-date */
2538         if (PageUptodate(page))
2539                 set_buffer_uptodate(bh);
2540
2541         if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2542                 err = -EIO;
2543                 ll_rw_block(READ, 1, &bh);
2544                 wait_on_buffer(bh);
2545                 /* Uhhuh. Read error. Complain and punt. */
2546                 if (!buffer_uptodate(bh))
2547                         goto unlock;
2548         }
2549
2550         kaddr = kmap_atomic(page, KM_USER0);
2551         memset(kaddr + offset, 0, length);
2552         flush_dcache_page(page);
2553         kunmap_atomic(kaddr, KM_USER0);
2554
2555         mark_buffer_dirty(bh);
2556         err = 0;
2557
2558 unlock:
2559         unlock_page(page);
2560         page_cache_release(page);
2561 out:
2562         return err;
2563 }
2564
2565 /*
2566  * The generic ->writepage function for buffer-backed address_spaces
2567  */
2568 int block_write_full_page(struct page *page, get_block_t *get_block,
2569                         struct writeback_control *wbc)
2570 {
2571         struct inode * const inode = page->mapping->host;
2572         loff_t i_size = i_size_read(inode);
2573         const unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2574         unsigned offset;
2575         void *kaddr;
2576
2577         /* Is the page fully inside i_size? */
2578         if (page->index < end_index)
2579                 return __block_write_full_page(inode, page, get_block, wbc);
2580
2581         /* Is the page fully outside i_size? (truncate in progress) */
2582         offset = i_size & (PAGE_CACHE_SIZE-1);
2583         if (page->index >= end_index+1 || !offset) {
2584                 /*
2585                  * The page may have dirty, unmapped buffers.  For example,
2586                  * they may have been added in ext3_writepage().  Make them
2587                  * freeable here, so the page does not leak.
2588                  */
2589                 block_invalidatepage(page, 0);
2590                 unlock_page(page);
2591                 return -EIO;
2592         }
2593
2594         /*
2595          * The page straddles i_size.  It must be zeroed out on each and every
2596          * writepage invocation because it may be mmapped.  "A file is mapped
2597          * in multiples of the page size.  For a file that is not a multiple of
2598          * the  page size, the remaining memory is zeroed when mapped, and
2599          * writes to that region are not written out to the file."
2600          */
2601         kaddr = kmap_atomic(page, KM_USER0);
2602         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2603         flush_dcache_page(page);
2604         kunmap_atomic(kaddr, KM_USER0);
2605         return __block_write_full_page(inode, page, get_block, wbc);
2606 }
2607
2608 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2609                             get_block_t *get_block)
2610 {
2611         struct buffer_head tmp;
2612         struct inode *inode = mapping->host;
2613         tmp.b_state = 0;
2614         tmp.b_blocknr = 0;
2615         get_block(inode, block, &tmp, 0);
2616         return tmp.b_blocknr;
2617 }
2618
2619 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2620 {
2621         struct buffer_head *bh = bio->bi_private;
2622
2623         if (bio->bi_size)
2624                 return 1;
2625
2626         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2627         bio_put(bio);
2628         return 0;
2629 }
2630
2631 int submit_bh(int rw, struct buffer_head * bh)
2632 {
2633         struct bio *bio;
2634
2635         BUG_ON(!buffer_locked(bh));
2636         BUG_ON(!buffer_mapped(bh));
2637         BUG_ON(!bh->b_end_io);
2638
2639         if ((rw == READ || rw == READA) && buffer_uptodate(bh))
2640                 buffer_error();
2641         if (rw == WRITE && !buffer_uptodate(bh))
2642                 buffer_error();
2643         if (rw == READ && buffer_dirty(bh))
2644                 buffer_error();
2645
2646         set_buffer_req(bh);
2647
2648         /*
2649          * from here on down, it's all bio -- do the initial mapping,
2650          * submit_bio -> generic_make_request may further map this bio around
2651          */
2652         bio = bio_alloc(GFP_NOIO, 1);
2653
2654         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2655         bio->bi_bdev = bh->b_bdev;
2656         bio->bi_io_vec[0].bv_page = bh->b_page;
2657         bio->bi_io_vec[0].bv_len = bh->b_size;
2658         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2659
2660         bio->bi_vcnt = 1;
2661         bio->bi_idx = 0;
2662         bio->bi_size = bh->b_size;
2663
2664         bio->bi_end_io = end_bio_bh_io_sync;
2665         bio->bi_private = bh;
2666
2667         return submit_bio(rw, bio);
2668 }
2669
2670 /**
2671  * ll_rw_block: low-level access to block devices (DEPRECATED)
2672  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2673  * @nr: number of &struct buffer_heads in the array
2674  * @bhs: array of pointers to &struct buffer_head
2675  *
2676  * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2677  * and requests an I/O operation on them, either a %READ or a %WRITE.
2678  * The third %READA option is described in the documentation for
2679  * generic_make_request() which ll_rw_block() calls.
2680  *
2681  * This function drops any buffer that it cannot get a lock on (with the
2682  * BH_Lock state bit), any buffer that appears to be clean when doing a
2683  * write request, and any buffer that appears to be up-to-date when doing
2684  * read request.  Further it marks as clean buffers that are processed for
2685  * writing (the buffer cache won't assume that they are actually clean until
2686  * the buffer gets unlocked).
2687  *
2688  * ll_rw_block sets b_end_io to simple completion handler that marks
2689  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2690  * any waiters.
2691  *
2692  * All of the buffers must be for the same device, and must also be a
2693  * multiple of the current approved size for the device.
2694  */
2695 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2696 {
2697         int i;
2698
2699         for (i = 0; i < nr; i++) {
2700                 struct buffer_head *bh = bhs[i];
2701
2702                 if (test_set_buffer_locked(bh))
2703                         continue;
2704
2705                 get_bh(bh);
2706                 bh->b_end_io = end_buffer_io_sync;
2707                 if (rw == WRITE) {
2708                         if (test_clear_buffer_dirty(bh)) {
2709                                 submit_bh(WRITE, bh);
2710                                 continue;
2711                         }
2712                 } else {
2713                         if (!buffer_uptodate(bh)) {
2714                                 submit_bh(rw, bh);
2715                                 continue;
2716                         }
2717                 }
2718                 unlock_buffer(bh);
2719                 put_bh(bh);
2720         }
2721 }
2722
2723 /*
2724  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2725  * and then start new I/O and then wait upon it.
2726  */
2727 void sync_dirty_buffer(struct buffer_head *bh)
2728 {
2729         WARN_ON(atomic_read(&bh->b_count) < 1);
2730         lock_buffer(bh);
2731         if (test_clear_buffer_dirty(bh)) {
2732                 get_bh(bh);
2733                 bh->b_end_io = end_buffer_io_sync;
2734                 submit_bh(WRITE, bh);
2735                 wait_on_buffer(bh);
2736         } else {
2737                 unlock_buffer(bh);
2738         }
2739 }
2740
2741 /*
2742  * Sanity checks for try_to_free_buffers.
2743  */
2744 static void check_ttfb_buffer(struct page *page, struct buffer_head *bh)
2745 {
2746         if (!buffer_uptodate(bh) && !buffer_req(bh)) {
2747                 if (PageUptodate(page) && page->mapping
2748                         && buffer_mapped(bh)    /* discard_buffer */
2749                         && S_ISBLK(page->mapping->host->i_mode))
2750                 {
2751                         buffer_error();
2752                 }
2753         }
2754 }
2755
2756 /*
2757  * try_to_free_buffers() checks if all the buffers on this particular page
2758  * are unused, and releases them if so.
2759  *
2760  * Exclusion against try_to_free_buffers may be obtained by either
2761  * locking the page or by holding its mapping's private_lock.
2762  *
2763  * If the page is dirty but all the buffers are clean then we need to
2764  * be sure to mark the page clean as well.  This is because the page
2765  * may be against a block device, and a later reattachment of buffers
2766  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2767  * filesystem data on the same device.
2768  *
2769  * The same applies to regular filesystem pages: if all the buffers are
2770  * clean then we set the page clean and proceed.  To do that, we require
2771  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2772  * private_lock.
2773  *
2774  * try_to_free_buffers() is non-blocking.
2775  */
2776 static inline int buffer_busy(struct buffer_head *bh)
2777 {
2778         return atomic_read(&bh->b_count) |
2779                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2780 }
2781
2782 static int
2783 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2784 {
2785         struct buffer_head *head = page_buffers(page);
2786         struct buffer_head *bh;
2787         int was_uptodate = 1;
2788
2789         bh = head;
2790         do {
2791                 check_ttfb_buffer(page, bh);
2792                 if (buffer_busy(bh))
2793                         goto failed;
2794                 if (!buffer_uptodate(bh) && !buffer_req(bh))
2795                         was_uptodate = 0;
2796                 bh = bh->b_this_page;
2797         } while (bh != head);
2798
2799         if (!was_uptodate && PageUptodate(page))
2800                 buffer_error();
2801
2802         do {
2803                 struct buffer_head *next = bh->b_this_page;
2804
2805                 if (!list_empty(&bh->b_assoc_buffers))
2806                         __remove_assoc_queue(bh);
2807                 bh = next;
2808         } while (bh != head);
2809         *buffers_to_free = head;
2810         __clear_page_buffers(page);
2811         return 1;
2812 failed:
2813         return 0;
2814 }
2815
2816 int try_to_free_buffers(struct page *page)
2817 {
2818         struct address_space * const mapping = page->mapping;
2819         struct buffer_head *buffers_to_free = NULL;
2820         int ret = 0;
2821
2822         BUG_ON(!PageLocked(page));
2823         if (PageWriteback(page))
2824                 return 0;
2825
2826         if (mapping == NULL) {          /* swapped-in anon page */
2827                 ret = drop_buffers(page, &buffers_to_free);
2828                 goto out;
2829         }
2830
2831         spin_lock(&mapping->private_lock);
2832         ret = drop_buffers(page, &buffers_to_free);
2833         if (ret && !PageSwapCache(page)) {
2834                 /*
2835                  * If the filesystem writes its buffers by hand (eg ext3)
2836                  * then we can have clean buffers against a dirty page.  We
2837                  * clean the page here; otherwise later reattachment of buffers
2838                  * could encounter a non-uptodate page, which is unresolvable.
2839                  * This only applies in the rare case where try_to_free_buffers
2840                  * succeeds but the page is not freed.
2841                  */
2842                 clear_page_dirty(page);
2843         }
2844         spin_unlock(&mapping->private_lock);
2845 out:
2846         if (buffers_to_free) {
2847                 struct buffer_head *bh = buffers_to_free;
2848
2849                 do {
2850                         struct buffer_head *next = bh->b_this_page;
2851                         free_buffer_head(bh);
2852                         bh = next;
2853                 } while (bh != buffers_to_free);
2854         }
2855         return ret;
2856 }
2857 EXPORT_SYMBOL(try_to_free_buffers);
2858
2859 int block_sync_page(struct page *page)
2860 {
2861         blk_run_queues();
2862         return 0;
2863 }
2864
2865 /*
2866  * There are no bdflush tunables left.  But distributions are
2867  * still running obsolete flush daemons, so we terminate them here.
2868  *
2869  * Use of bdflush() is deprecated and will be removed in a future kernel.
2870  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2871  */
2872 asmlinkage long sys_bdflush(int func, long data)
2873 {
2874         static int msg_count;
2875
2876         if (!capable(CAP_SYS_ADMIN))
2877                 return -EPERM;
2878
2879         if (msg_count < 5) {
2880                 msg_count++;
2881                 printk(KERN_INFO
2882                         "warning: process `%s' used the obsolete bdflush"
2883                         " system call\n", current->comm);
2884                 printk(KERN_INFO "Fix your initscripts?\n");
2885         }
2886
2887         if (func == 1)
2888                 do_exit(0);
2889         return 0;
2890 }
2891
2892 /*
2893  * Buffer-head allocation
2894  */
2895 static kmem_cache_t *bh_cachep;
2896
2897 /*
2898  * Once the number of bh's in the machine exceeds this level, we start
2899  * stripping them in writeback.
2900  */
2901 static int max_buffer_heads;
2902
2903 int buffer_heads_over_limit;
2904
2905 struct bh_accounting {
2906         int nr;                 /* Number of live bh's */
2907         int ratelimit;          /* Limit cacheline bouncing */
2908 };
2909
2910 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2911
2912 static void recalc_bh_state(void)
2913 {
2914         int i;
2915         int tot = 0;
2916
2917         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2918                 return;
2919         __get_cpu_var(bh_accounting).ratelimit = 0;
2920         for (i = 0; i < NR_CPUS; i++) {
2921                 if (cpu_online(i))
2922                         tot += per_cpu(bh_accounting, i).nr;
2923         }
2924         buffer_heads_over_limit = (tot > max_buffer_heads);
2925 }
2926
2927 struct buffer_head *alloc_buffer_head(int gfp_flags)
2928 {
2929         struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
2930         if (ret) {
2931                 preempt_disable();
2932                 __get_cpu_var(bh_accounting).nr++;
2933                 recalc_bh_state();
2934                 preempt_enable();
2935         }
2936         return ret;
2937 }
2938 EXPORT_SYMBOL(alloc_buffer_head);
2939
2940 void free_buffer_head(struct buffer_head *bh)
2941 {
2942         BUG_ON(!list_empty(&bh->b_assoc_buffers));
2943         kmem_cache_free(bh_cachep, bh);
2944         preempt_disable();
2945         __get_cpu_var(bh_accounting).nr--;
2946         recalc_bh_state();
2947         preempt_enable();
2948 }
2949 EXPORT_SYMBOL(free_buffer_head);
2950
2951 static void
2952 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
2953 {
2954         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2955                             SLAB_CTOR_CONSTRUCTOR) {
2956                 struct buffer_head * bh = (struct buffer_head *)data;
2957
2958                 memset(bh, 0, sizeof(*bh));
2959                 INIT_LIST_HEAD(&bh->b_assoc_buffers);
2960         }
2961 }
2962
2963 static void buffer_init_cpu(int cpu)
2964 {
2965         struct bh_accounting *bha = &per_cpu(bh_accounting, cpu);
2966         struct bh_lru *bhl = &per_cpu(bh_lrus, cpu);
2967
2968         bha->nr = 0;
2969         bha->ratelimit = 0;
2970         memset(bhl, 0, sizeof(*bhl));
2971 }
2972
2973 static int __devinit buffer_cpu_notify(struct notifier_block *self,
2974                                 unsigned long action, void *hcpu)
2975 {
2976         long cpu = (long)hcpu;
2977         switch(action) {
2978         case CPU_UP_PREPARE:
2979                 buffer_init_cpu(cpu);
2980                 break;
2981         default:
2982                 break;
2983         }
2984         return NOTIFY_OK;
2985 }
2986
2987 static struct notifier_block __devinitdata buffer_nb = {
2988         .notifier_call  = buffer_cpu_notify,
2989 };
2990
2991 void __init buffer_init(void)
2992 {
2993         int i;
2994         int nrpages;
2995
2996         bh_cachep = kmem_cache_create("buffer_head",
2997                         sizeof(struct buffer_head), 0,
2998                         0, init_buffer_head, NULL);
2999         for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
3000                 init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
3001
3002         /*
3003          * Limit the bh occupancy to 10% of ZONE_NORMAL
3004          */
3005         nrpages = (nr_free_buffer_pages() * 10) / 100;
3006         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3007         buffer_cpu_notify(&buffer_nb, (unsigned long)CPU_UP_PREPARE,
3008                                 (void *)(long)smp_processor_id());
3009         register_cpu_notifier(&buffer_nb);
3010 }