1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
7 /*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/config.h>
22 #include <linux/kernel.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/mempool.h>
35 #include <linux/hash.h>
36 #include <linux/suspend.h>
37 #include <linux/buffer_head.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <asm/bitops.h>
42 static void invalidate_bh_lrus(void);
44 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
47 * Hashed waitqueue_head's for wait_on_buffer()
49 #define BH_WAIT_TABLE_ORDER 7
50 static struct bh_wait_queue_head {
51 wait_queue_head_t wqh;
52 } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
55 * Debug/devel support stuff
58 void __buffer_error(char *file, int line)
60 static int enough;
62 if (enough > 10)
63 return;
64 enough++;
65 printk("buffer layer error at %s:%d\n", file, line);
66 printk("Pass this trace through ksymoops for reporting\n");
67 dump_stack();
69 EXPORT_SYMBOL(__buffer_error);
71 inline void
72 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
74 bh->b_end_io = handler;
75 bh->b_private = private;
79 * Return the address of the waitqueue_head to be used for this
80 * buffer_head
82 static wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
84 return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
88 * Wait on a buffer until someone does a wakeup on it. Needs
89 * lots of external locking. ext3 uses this. Fix it.
91 void sleep_on_buffer(struct buffer_head *bh)
93 wait_queue_head_t *wq = bh_waitq_head(bh);
94 sleep_on(wq);
96 EXPORT_SYMBOL(sleep_on_buffer);
98 void wake_up_buffer(struct buffer_head *bh)
100 wait_queue_head_t *wq = bh_waitq_head(bh);
102 if (waitqueue_active(wq))
103 wake_up_all(wq);
105 EXPORT_SYMBOL(wake_up_buffer);
107 void unlock_buffer(struct buffer_head *bh)
110 * unlock_buffer against a zero-count bh is a bug, if the page
111 * is not locked. Because then nothing protects the buffer's
112 * waitqueue, which is used here. (Well. Other locked buffers
113 * against the page will pin it. But complain anyway).
115 if (atomic_read(&bh->b_count) == 0 &&
116 !PageLocked(bh->b_page) &&
117 !PageWriteback(bh->b_page))
118 buffer_error();
120 clear_buffer_locked(bh);
121 smp_mb__after_clear_bit();
122 wake_up_buffer(bh);
126 * Block until a buffer comes unlocked. This doesn't stop it
127 * from becoming locked again - you have to lock it yourself
128 * if you want to preserve its state.
130 void __wait_on_buffer(struct buffer_head * bh)
132 wait_queue_head_t *wqh = bh_waitq_head(bh);
133 DEFINE_WAIT(wait);
135 get_bh(bh);
136 do {
137 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
138 blk_run_queues();
139 if (buffer_locked(bh))
140 schedule();
141 } while (buffer_locked(bh));
142 put_bh(bh);
143 finish_wait(wqh, &wait);
146 static void
147 __set_page_buffers(struct page *page, struct buffer_head *head)
149 if (page_has_buffers(page))
150 buffer_error();
151 page_cache_get(page);
152 SetPagePrivate(page);
153 page->private = (unsigned long)head;
156 static void
157 __clear_page_buffers(struct page *page)
159 ClearPagePrivate(page);
160 page->private = 0;
161 page_cache_release(page);
164 static void buffer_io_error(struct buffer_head *bh)
166 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
167 bdevname(bh->b_bdev),
168 (unsigned long long)bh->b_blocknr);
172 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
173 * unlock the buffer. This is what ll_rw_block uses too.
175 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
177 if (uptodate) {
178 set_buffer_uptodate(bh);
179 } else {
181 * This happens, due to failed READA attempts.
182 * buffer_io_error(bh);
184 clear_buffer_uptodate(bh);
186 unlock_buffer(bh);
187 put_bh(bh);
191 * Write out and wait upon all the dirty data associated with a block
192 * device via its mapping. Does not take the superblock lock.
194 int sync_blockdev(struct block_device *bdev)
196 int ret = 0;
198 if (bdev) {
199 int err;
201 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
202 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
203 if (!ret)
204 ret = err;
206 return ret;
208 EXPORT_SYMBOL(sync_blockdev);
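/*
 * Illustrative sketch (not part of this file): a caller that is about to
 * revalidate removable media would typically pair sync_blockdev() with
 * invalidate_bdev() (defined further down in this file).  The function
 * name "example_flush_and_invalidate" is hypothetical.
 */
static int example_flush_and_invalidate(struct block_device *bdev)
{
	int err = sync_blockdev(bdev);	/* write back and wait on dirty pages */

	invalidate_bdev(bdev, 0);	/* then drop the clean cached data */
	return err;
}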
211 * Write out and wait upon all dirty data associated with this
212 * superblock. Filesystem data as well as the underlying block
213 * device. Takes the superblock lock.
215 int fsync_super(struct super_block *sb)
217 sync_inodes_sb(sb, 0);
218 DQUOT_SYNC(sb);
219 lock_super(sb);
220 if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
221 sb->s_op->write_super(sb);
222 unlock_super(sb);
223 sync_blockdev(sb->s_bdev);
224 sync_inodes_sb(sb, 1);
226 return sync_blockdev(sb->s_bdev);
230 * Write out and wait upon all dirty data associated with this
231 * device. Filesystem data as well as the underlying block
232 * device. Takes the superblock lock.
234 int fsync_bdev(struct block_device *bdev)
236 struct super_block *sb = get_super(bdev);
237 if (sb) {
238 int res = fsync_super(sb);
239 drop_super(sb);
240 return res;
242 return sync_blockdev(bdev);
246 * sync everything. Start out by waking pdflush, because that writes back
247 * all queues in parallel.
249 asmlinkage long sys_sync(void)
251 wakeup_bdflush(0);
252 sync_inodes(0); /* All mappings and inodes, including block devices */
253 DQUOT_SYNC(NULL);
254 sync_supers(); /* Write the superblocks */
255 sync_inodes(1); /* All the mappings and inodes, again. */
256 return 0;
260 * Generic function to fsync a file.
262 * filp may be NULL if called via the msync of a vma.
265 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
267 struct inode * inode = dentry->d_inode;
268 struct super_block * sb;
269 int ret;
271 /* sync the inode to buffers */
272 write_inode_now(inode, 0);
274 /* sync the superblock to buffers */
275 sb = inode->i_sb;
276 lock_super(sb);
277 if (sb->s_op && sb->s_op->write_super)
278 sb->s_op->write_super(sb);
279 unlock_super(sb);
281 /* .. finally sync the buffers to disk */
282 ret = sync_blockdev(sb->s_bdev);
283 return ret;
286 asmlinkage long sys_fsync(unsigned int fd)
288 struct file * file;
289 struct dentry * dentry;
290 struct inode * inode;
291 int ret, err;
293 ret = -EBADF;
294 file = fget(fd);
295 if (!file)
296 goto out;
298 dentry = file->f_dentry;
299 inode = dentry->d_inode;
301 ret = -EINVAL;
302 if (!file->f_op || !file->f_op->fsync) {
303 /* Why? We can still call filemap_fdatawrite */
304 goto out_putf;
307 /* We need to protect against concurrent writers.. */
308 down(&inode->i_sem);
309 ret = filemap_fdatawrite(inode->i_mapping);
310 err = file->f_op->fsync(file, dentry, 0);
311 if (!ret)
312 ret = err;
313 err = filemap_fdatawait(inode->i_mapping);
314 if (!ret)
315 ret = err;
316 up(&inode->i_sem);
318 out_putf:
319 fput(file);
320 out:
321 return ret;
324 asmlinkage long sys_fdatasync(unsigned int fd)
326 struct file * file;
327 struct dentry * dentry;
328 struct inode * inode;
329 int ret, err;
331 ret = -EBADF;
332 file = fget(fd);
333 if (!file)
334 goto out;
336 dentry = file->f_dentry;
337 inode = dentry->d_inode;
339 ret = -EINVAL;
340 if (!file->f_op || !file->f_op->fsync)
341 goto out_putf;
343 down(&inode->i_sem);
344 ret = filemap_fdatawrite(inode->i_mapping);
345 err = file->f_op->fsync(file, dentry, 1);
346 if (!ret)
347 ret = err;
348 err = filemap_fdatawait(inode->i_mapping);
349 if (!ret)
350 ret = err;
351 up(&inode->i_sem);
353 out_putf:
354 fput(file);
355 out:
356 return ret;
360 * Various filesystems appear to want __find_get_block to be non-blocking.
361 * But it's the page lock which protects the buffers. To get around this,
362 * we get exclusion from try_to_free_buffers with the blockdev mapping's
363 * private_lock.
365 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
366 * may be quite high. This code could TryLock the page, and if that
367 * succeeds, there is no need to take private_lock. (But if
368 * private_lock is contended then so is mapping->page_lock).
370 static struct buffer_head *
371 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
373 struct inode *bd_inode = bdev->bd_inode;
374 struct address_space *bd_mapping = bd_inode->i_mapping;
375 struct buffer_head *ret = NULL;
376 unsigned long index;
377 struct buffer_head *bh;
378 struct buffer_head *head;
379 struct page *page;
381 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
382 page = find_get_page(bd_mapping, index);
383 if (!page)
384 goto out;
386 spin_lock(&bd_mapping->private_lock);
387 if (!page_has_buffers(page))
388 goto out_unlock;
389 head = page_buffers(page);
390 bh = head;
391 do {
392 if (bh->b_blocknr == block) {
393 ret = bh;
394 get_bh(bh);
395 goto out_unlock;
397 bh = bh->b_this_page;
398 } while (bh != head);
399 buffer_error();
400 out_unlock:
401 spin_unlock(&bd_mapping->private_lock);
402 page_cache_release(page);
403 out:
404 return ret;
407 /* If invalidate_buffers() will trash dirty buffers, it means some kind
408 of fs corruption is going on. Trashing dirty data always implies losing
409 information that was supposed to be just stored on the physical layer
410 by the user.
412 Thus invalidate_buffers in general usage is not allowed to trash
413 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
414 be preserved. These buffers are simply skipped.
416 We also skip buffers which are still in use. For example this can
417 happen if a userspace program is reading the block device.
419 NOTE: In the case where the user removed a removable-media-disk even if
420 there's still dirty data not synced on disk (due to a bug in the device driver
421 or due to an error by the user), by not destroying the dirty buffers we could
422 also generate corruption on the next media inserted, thus a parameter is
423 necessary to handle this case in the safest way possible (trying
424 to not corrupt also the new disk inserted with the data belonging to
425 the old now corrupted disk). Also for the ramdisk the natural thing
426 to do in order to release the ramdisk memory is to destroy dirty buffers.
428 These are two special cases. Normal usage implies that the device driver
429 issues a sync on the device (without waiting for I/O completion) and
430 then makes an invalidate_buffers call that doesn't trash dirty buffers.
432 For handling cache coherency with the blkdev pagecache the 'update' case
433 has been introduced. It is needed to re-read from disk any pinned
434 buffer. NOTE: re-reading from disk is destructive so we can do it only
435 when we assume nobody is changing the buffercache under our I/O and when
436 we think the disk contains more recent information than the buffercache.
437 The update == 1 pass marks the buffers we need to update, the update == 2
438 pass does the actual I/O. */
439 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
441 invalidate_bh_lrus();
443 * FIXME: what about destroy_dirty_buffers?
444 * We really want to use invalidate_inode_pages2() for
445 * that, but not until that's cleaned up.
447 invalidate_inode_pages(bdev->bd_inode->i_mapping);
450 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
452 struct block_device *bdev = bdget(kdev_t_to_nr(dev));
453 if (bdev) {
454 invalidate_bdev(bdev, destroy_dirty_buffers);
455 bdput(bdev);
460 * Kick pdflush then try to free up some ZONE_NORMAL memory.
462 static void free_more_memory(void)
464 struct zone *zone;
465 pg_data_t *pgdat;
467 wakeup_bdflush(1024);
468 blk_run_queues();
469 yield();
471 for_each_pgdat(pgdat) {
472 zone = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones[0];
473 if (zone)
474 try_to_free_pages(zone, GFP_NOFS, 0);
479 * I/O completion handler for block_read_full_page() - pages
480 * which come unlocked at the end of I/O.
482 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
484 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
485 unsigned long flags;
486 struct buffer_head *tmp;
487 struct page *page;
488 int page_uptodate = 1;
490 BUG_ON(!buffer_async_read(bh));
492 page = bh->b_page;
493 if (uptodate) {
494 set_buffer_uptodate(bh);
495 } else {
496 clear_buffer_uptodate(bh);
497 buffer_io_error(bh);
498 SetPageError(page);
502 * Be _very_ careful from here on. Bad things can happen if
503 * two buffer heads end IO at almost the same time and both
504 * decide that the page is now completely done.
506 spin_lock_irqsave(&page_uptodate_lock, flags);
507 clear_buffer_async_read(bh);
508 unlock_buffer(bh);
509 tmp = bh;
510 do {
511 if (!buffer_uptodate(tmp))
512 page_uptodate = 0;
513 if (buffer_async_read(tmp)) {
514 BUG_ON(!buffer_locked(tmp));
515 goto still_busy;
517 tmp = tmp->b_this_page;
518 } while (tmp != bh);
519 spin_unlock_irqrestore(&page_uptodate_lock, flags);
522 * If none of the buffers had errors and they are all
523 * uptodate then we can set the page uptodate.
525 if (page_uptodate && !PageError(page))
526 SetPageUptodate(page);
527 unlock_page(page);
528 return;
530 still_busy:
531 spin_unlock_irqrestore(&page_uptodate_lock, flags);
532 return;
536 * Completion handler for block_write_full_page() - pages which are unlocked
537 * during I/O, and which have PageWriteback cleared upon I/O completion.
539 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
541 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
542 unsigned long flags;
543 struct buffer_head *tmp;
544 struct page *page;
546 BUG_ON(!buffer_async_write(bh));
548 page = bh->b_page;
549 if (uptodate) {
550 set_buffer_uptodate(bh);
551 } else {
552 buffer_io_error(bh);
553 clear_buffer_uptodate(bh);
554 SetPageError(page);
557 spin_lock_irqsave(&page_uptodate_lock, flags);
558 clear_buffer_async_write(bh);
559 unlock_buffer(bh);
560 tmp = bh->b_this_page;
561 while (tmp != bh) {
562 if (buffer_async_write(tmp)) {
563 BUG_ON(!buffer_locked(tmp));
564 goto still_busy;
566 tmp = tmp->b_this_page;
568 spin_unlock_irqrestore(&page_uptodate_lock, flags);
569 end_page_writeback(page);
570 return;
572 still_busy:
573 spin_unlock_irqrestore(&page_uptodate_lock, flags);
574 return;
578 * If a page's buffers are under async read-in (end_buffer_async_read
579 * completion) then there is a possibility that another thread of
580 * control could lock one of the buffers after it has completed
581 * but while some of the other buffers have not completed. This
582 * locked buffer would confuse end_buffer_async_read() into not unlocking
583 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
584 * that this buffer is not under async I/O.
586 * The page comes unlocked when it has no locked buffer_async buffers
587 * left.
589 * PageLocked prevents anyone starting new async I/O reads any of
590 * the buffers.
592 * PageWriteback is used to prevent simultaneous writeout of the same
593 * page.
595 * PageLocked prevents anyone from starting writeback of a page which is
596 * under read I/O (PageWriteback is only ever set against a locked page).
598 void mark_buffer_async_read(struct buffer_head *bh)
600 bh->b_end_io = end_buffer_async_read;
601 set_buffer_async_read(bh);
603 EXPORT_SYMBOL(mark_buffer_async_read);
605 void mark_buffer_async_write(struct buffer_head *bh)
607 bh->b_end_io = end_buffer_async_write;
608 set_buffer_async_write(bh);
610 EXPORT_SYMBOL(mark_buffer_async_write);
614 * fs/buffer.c contains helper functions for buffer-backed address space's
615 * fsync functions. A common requirement for buffer-based filesystems is
616 * that certain data from the backing blockdev needs to be written out for
617 * a successful fsync(). For example, ext2 indirect blocks need to be
618 * written back and waited upon before fsync() returns.
620 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
621 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
622 * management of a list of dependent buffers at ->i_mapping->private_list.
624 * Locking is a little subtle: try_to_free_buffers() will remove buffers
625 * from their controlling inode's queue when they are being freed. But
626 * try_to_free_buffers() will be operating against the *blockdev* mapping
627 * at the time, not against the S_ISREG file which depends on those buffers.
628 * So the locking for private_list is via the private_lock in the address_space
629 * which backs the buffers. Which is different from the address_space
630 * against which the buffers are listed. So for a particular address_space,
631 * mapping->private_lock does *not* protect mapping->private_list! In fact,
632 * mapping->private_list will always be protected by the backing blockdev's
633 * ->private_lock.
635 * Which introduces a requirement: all buffers on an address_space's
636 * ->private_list must be from the same address_space: the blockdev's.
638 * address_spaces which do not place buffers at ->private_list via these
639 * utility functions are free to use private_lock and private_list for
640 * whatever they want. The only requirement is that list_empty(private_list)
641 * be true at clear_inode() time.
643 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
644 * filesystems should do that. invalidate_inode_buffers() should just go
645 * BUG_ON(!list_empty).
647 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
648 * take an address_space, not an inode. And it should be called
649 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
650 * queued up.
652 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
653 * list if it is already on a list. Because if the buffer is on a list,
654 * it *must* already be on the right one. If not, the filesystem is being
655 * silly. This will save a ton of locking. But first we have to ensure
656 * that buffers are taken *off* the old inode's list when they are freed
657 * (presumably in truncate). That requires careful auditing of all
658 * filesystems (do it inside bforget()). It could also be done by bringing
659 * b_inode back.
662 void buffer_insert_list(spinlock_t *lock,
663 struct buffer_head *bh, struct list_head *list)
665 spin_lock(lock);
666 list_del(&bh->b_assoc_buffers);
667 list_add(&bh->b_assoc_buffers, list);
668 spin_unlock(lock);
672 * The buffer's backing address_space's private_lock must be held
674 static inline void __remove_assoc_queue(struct buffer_head *bh)
676 list_del_init(&bh->b_assoc_buffers);
679 int inode_has_buffers(struct inode *inode)
681 return !list_empty(&inode->i_mapping->private_list);
685 * osync is designed to support O_SYNC io. It waits synchronously for
686 * all already-submitted IO to complete, but does not queue any new
687 * writes to the disk.
689 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
690 * you dirty the buffers, and then use osync_inode_buffers to wait for
691 * completion. Any other dirty buffers which are not yet queued for
692 * write will not be flushed to disk by the osync.
694 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
696 struct buffer_head *bh;
697 struct list_head *p;
698 int err = 0;
700 spin_lock(lock);
701 repeat:
702 list_for_each_prev(p, list) {
703 bh = BH_ENTRY(p);
704 if (buffer_locked(bh)) {
705 get_bh(bh);
706 spin_unlock(lock);
707 wait_on_buffer(bh);
708 if (!buffer_uptodate(bh))
709 err = -EIO;
710 brelse(bh);
711 spin_lock(lock);
712 goto repeat;
715 spin_unlock(lock);
716 return err;
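/*
 * Illustrative sketch (not part of this file): the O_SYNC pattern the
 * comment above describes - queue the write as the buffer is dirtied,
 * then wait for completion afterwards.  "example_osync_one_buffer" is a
 * hypothetical helper operating on a buffer the caller just modified.
 */
static int example_osync_one_buffer(struct buffer_head *bh)
{
	mark_buffer_dirty(bh);
	ll_rw_block(WRITE, 1, &bh);	/* start the write immediately */
	wait_on_buffer(bh);		/* later: wait for it to complete */
	return buffer_uptodate(bh) ? 0 : -EIO;
}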
720 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
721 * buffers
722 * @buffer_mapping - the mapping which backs the buffers' data
723 * @mapping - the mapping which wants those buffers written
725 * Starts I/O against the buffers at mapping->private_list, and waits upon
726 * that I/O.
728 * Basically, this is a convenience function for fsync(). @buffer_mapping is
729 * the blockdev which "owns" the buffers and @mapping is a file or directory
730 * which needs those buffers to be written for a successful fsync().
732 int sync_mapping_buffers(struct address_space *mapping)
734 struct address_space *buffer_mapping = mapping->assoc_mapping;
736 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
737 return 0;
739 return fsync_buffers_list(&buffer_mapping->private_lock,
740 &mapping->private_list);
742 EXPORT_SYMBOL(sync_mapping_buffers);
745 * Called when we've recently written block `bblock', and it is known that
746 * `bblock' was for a buffer_boundary() buffer. This means that the block at
747 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
748 * dirty, schedule it for IO. So that indirects merge nicely with their data.
750 void write_boundary_block(struct block_device *bdev,
751 sector_t bblock, unsigned blocksize)
753 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
754 if (bh) {
755 if (buffer_dirty(bh))
756 ll_rw_block(WRITE, 1, &bh);
757 put_bh(bh);
761 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
763 struct address_space *mapping = inode->i_mapping;
764 struct address_space *buffer_mapping = bh->b_page->mapping;
766 mark_buffer_dirty(bh);
767 if (!mapping->assoc_mapping) {
768 mapping->assoc_mapping = buffer_mapping;
769 } else {
770 if (mapping->assoc_mapping != buffer_mapping)
771 BUG();
773 if (list_empty(&bh->b_assoc_buffers))
774 buffer_insert_list(&buffer_mapping->private_lock,
775 bh, &mapping->private_list);
777 EXPORT_SYMBOL(mark_buffer_dirty_inode);
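/*
 * Illustrative sketch (not part of this file): the intended use of the
 * private_list machinery described above.  A filesystem queues a metadata
 * buffer (say an indirect block) against the file's inode, and its fsync
 * later writes those buffers back with sync_mapping_buffers().  All
 * "example_" names are hypothetical.
 */
static void example_dirty_metadata(struct inode *inode, struct buffer_head *bh)
{
	/* dirties bh and files it on inode->i_mapping->private_list */
	mark_buffer_dirty_inode(bh, inode);
}

static int example_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;

	/* write out and wait upon the buffers queued above */
	return sync_mapping_buffers(inode->i_mapping);
}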
780 * Write out and wait upon a list of buffers.
782 * We have conflicting pressures: we want to make sure that all
783 * initially dirty buffers get waited on, but that any subsequently
784 * dirtied buffers don't. After all, we don't want fsync to last
785 * forever if somebody is actively writing to the file.
787 * Do this in two main stages: first we copy dirty buffers to a
788 * temporary inode list, queueing the writes as we go. Then we clean
789 * up, waiting for those writes to complete.
791 * During this second stage, any subsequent updates to the file may end
792 * up refiling the buffer on the original inode's dirty list again, so
793 * there is a chance we will end up with a buffer queued for write but
794 * not yet completed on that list. So, as a final cleanup we go through
795 * the osync code to catch these locked, dirty buffers without requeuing
796 * any newly dirty buffers for write.
798 int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
800 struct buffer_head *bh;
801 struct list_head tmp;
802 int err = 0, err2;
804 INIT_LIST_HEAD(&tmp);
806 spin_lock(lock);
807 while (!list_empty(list)) {
808 bh = BH_ENTRY(list->next);
809 list_del_init(&bh->b_assoc_buffers);
810 if (buffer_dirty(bh) || buffer_locked(bh)) {
811 list_add(&bh->b_assoc_buffers, &tmp);
812 if (buffer_dirty(bh)) {
813 get_bh(bh);
814 spin_unlock(lock);
816 * Ensure any pending I/O completes so that
817 * ll_rw_block() actually writes the current
818 * contents - it is a noop if I/O is still in
819 * flight on potentially older contents.
821 wait_on_buffer(bh);
822 ll_rw_block(WRITE, 1, &bh);
823 brelse(bh);
824 spin_lock(lock);
829 while (!list_empty(&tmp)) {
830 bh = BH_ENTRY(tmp.prev);
831 __remove_assoc_queue(bh);
832 get_bh(bh);
833 spin_unlock(lock);
834 wait_on_buffer(bh);
835 if (!buffer_uptodate(bh))
836 err = -EIO;
837 brelse(bh);
838 spin_lock(lock);
841 spin_unlock(lock);
842 err2 = osync_buffers_list(lock, list);
843 if (err)
844 return err;
845 else
846 return err2;
850 * Invalidate any and all dirty buffers on a given inode. We are
851 * probably unmounting the fs, but that doesn't mean we have already
852 * done a sync(). Just drop the buffers from the inode list.
854 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
855 * assumes that all the buffers are against the blockdev. Not true
856 * for reiserfs.
858 void invalidate_inode_buffers(struct inode *inode)
860 if (inode_has_buffers(inode)) {
861 struct address_space *mapping = inode->i_mapping;
862 struct list_head *list = &mapping->private_list;
863 struct address_space *buffer_mapping = mapping->assoc_mapping;
865 spin_lock(&buffer_mapping->private_lock);
866 while (!list_empty(list))
867 __remove_assoc_queue(BH_ENTRY(list->next));
868 spin_unlock(&buffer_mapping->private_lock);
873 * Create the appropriate buffers when given a page for data area and
874 * the size of each buffer.. Use the bh->b_this_page linked list to
875 * follow the buffers created. Return NULL if unable to create more
876 * buffers.
878 * The retry flag is used to differentiate async IO (paging, swapping)
879 * which may not fail from ordinary buffer allocations.
881 static struct buffer_head *
882 create_buffers(struct page * page, unsigned long size, int retry)
884 struct buffer_head *bh, *head;
885 long offset;
887 try_again:
888 head = NULL;
889 offset = PAGE_SIZE;
890 while ((offset -= size) >= 0) {
891 int pf_flags = current->flags;
893 current->flags |= PF_NOWARN;
894 bh = alloc_buffer_head();
895 current->flags = pf_flags;
896 if (!bh)
897 goto no_grow;
899 bh->b_bdev = NULL;
900 bh->b_this_page = head;
901 bh->b_blocknr = -1;
902 head = bh;
904 bh->b_state = 0;
905 atomic_set(&bh->b_count, 0);
906 bh->b_size = size;
908 /* Link the buffer to its page */
909 set_bh_page(bh, page, offset);
911 bh->b_end_io = NULL;
913 return head;
915 * In case anything failed, we just free everything we got.
917 no_grow:
918 if (head) {
919 do {
920 bh = head;
921 head = head->b_this_page;
922 free_buffer_head(bh);
923 } while (head);
927 * Return failure for non-async IO requests. Async IO requests
928 * are not allowed to fail, so we have to wait until buffer heads
929 * become available. But we don't want tasks sleeping with
930 * partially complete buffers, so all were released above.
932 if (!retry)
933 return NULL;
935 /* We're _really_ low on memory. Now we just
936 * wait for old buffer heads to become free due to
937 * finishing IO. Since this is an async request and
938 * the reserve list is empty, we're sure there are
939 * async buffer heads in use.
941 blk_run_queues();
943 free_more_memory();
944 goto try_again;
947 static inline void
948 link_dev_buffers(struct page *page, struct buffer_head *head)
950 struct buffer_head *bh, *tail;
952 bh = head;
953 do {
954 tail = bh;
955 bh = bh->b_this_page;
956 } while (bh);
957 tail->b_this_page = head;
958 __set_page_buffers(page, head);
962 * Initialise the state of a blockdev page's buffers.
964 static void
965 init_page_buffers(struct page *page, struct block_device *bdev,
966 int block, int size)
968 struct buffer_head *head = page_buffers(page);
969 struct buffer_head *bh = head;
970 unsigned int b_state;
972 b_state = 1 << BH_Mapped;
973 if (PageUptodate(page))
974 b_state |= 1 << BH_Uptodate;
976 do {
977 if (!(bh->b_state & (1 << BH_Mapped))) {
978 init_buffer(bh, NULL, NULL);
979 bh->b_bdev = bdev;
980 bh->b_blocknr = block;
981 bh->b_state = b_state;
983 block++;
984 bh = bh->b_this_page;
985 } while (bh != head);
989 * Create the page-cache page that contains the requested block.
991 * This is used purely for blockdev mappings.
993 static struct page *
994 grow_dev_page(struct block_device *bdev, unsigned long block,
995 unsigned long index, int size)
997 struct inode *inode = bdev->bd_inode;
998 struct page *page;
999 struct buffer_head *bh;
1001 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1002 if (!page)
1003 return NULL;
1005 if (!PageLocked(page))
1006 BUG();
1008 if (page_has_buffers(page)) {
1009 bh = page_buffers(page);
1010 if (bh->b_size == size)
1011 return page;
1012 if (!try_to_free_buffers(page))
1013 goto failed;
1017 * Allocate some buffers for this page
1019 bh = create_buffers(page, size, 0);
1020 if (!bh)
1021 goto failed;
1024 * Link the page to the buffers and initialise them. Take the
1025 * lock to be atomic wrt __find_get_block(), which does not
1026 * run under the page lock.
1028 spin_lock(&inode->i_mapping->private_lock);
1029 link_dev_buffers(page, bh);
1030 init_page_buffers(page, bdev, block, size);
1031 spin_unlock(&inode->i_mapping->private_lock);
1032 return page;
1034 failed:
1035 buffer_error();
1036 unlock_page(page);
1037 page_cache_release(page);
1038 return NULL;
1042 * Create buffers for the specified block device block's page. If
1043 * that page was dirty, the buffers are set dirty also.
1045 * Except that's a bug. Attaching dirty buffers to a dirty
1046 * blockdev's page can result in filesystem corruption, because
1047 * some of those buffers may be aliases of filesystem data.
1048 * grow_dev_page() will go BUG() if this happens.
1050 static inline int
1051 grow_buffers(struct block_device *bdev, unsigned long block, int size)
1053 struct page *page;
1054 unsigned long index;
1055 int sizebits;
1057 /* Size must be multiple of hard sectorsize */
1058 if (size & (bdev_hardsect_size(bdev)-1))
1059 BUG();
1060 if (size < 512 || size > PAGE_SIZE)
1061 BUG();
1063 sizebits = -1;
1064 do {
1065 sizebits++;
1066 } while ((size << sizebits) < PAGE_SIZE);
1068 index = block >> sizebits;
1069 block = index << sizebits;
1071 /* Create a page with the proper size buffers.. */
1072 page = grow_dev_page(bdev, block, index, size);
1073 if (!page)
1074 return 0;
1075 unlock_page(page);
1076 page_cache_release(page);
1077 return 1;
1080 struct buffer_head *
1081 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1083 for (;;) {
1084 struct buffer_head * bh;
1086 bh = __find_get_block(bdev, block, size);
1087 if (bh)
1088 return bh;
1090 if (!grow_buffers(bdev, block, size))
1091 free_more_memory();
1096 * The relationship between dirty buffers and dirty pages:
1098 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1099 * the page appears on its address_space.dirty_pages list.
1101 * At all times, the dirtiness of the buffers represents the dirtiness of
1102 * subsections of the page. If the page has buffers, the page dirty bit is
1103 * merely a hint about the true dirty state.
1105 * When a page is set dirty in its entirety, all its buffers are marked dirty
1106 * (if the page has buffers).
1108 * When a buffer is marked dirty, its page is dirtied, but the page's other
1109 * buffers are not.
1111 * Also. When blockdev buffers are explicitly read with bread(), they
1112 * individually become uptodate. But their backing page remains not
1113 * uptodate - even if all of its buffers are uptodate. A subsequent
1114 * block_read_full_page() against that page will discover all the uptodate
1115 * buffers, will set the page uptodate and will perform no I/O.
1119 * mark_buffer_dirty - mark a buffer_head as needing writeout
1121 * mark_buffer_dirty() will set the dirty bit against the buffer,
1122 * then set its backing page dirty, then attach the page to its
1123 * address_space's dirty_pages list and then attach the address_space's
1124 * inode to its superblock's dirty inode list.
1126 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1127 * mapping->page_lock and the global inode_lock.
1129 void mark_buffer_dirty(struct buffer_head *bh)
1131 if (!buffer_uptodate(bh))
1132 buffer_error();
1133 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1134 __set_page_dirty_nobuffers(bh->b_page);
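/*
 * Illustrative sketch (not part of this file): the usual read-modify-dirty
 * cycle for a metadata block, using __bread()/brelse() from further down
 * in this file.  "example_patch_block" and its parameters are hypothetical.
 */
static int example_patch_block(struct block_device *bdev, sector_t block,
				int size, unsigned offset, char byte)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)
		return -EIO;
	bh->b_data[offset] = byte;	/* modify the in-memory copy */
	mark_buffer_dirty(bh);		/* buffer, page and inode go dirty */
	brelse(bh);			/* writeback picks it up later */
	return 0;
}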
1138 * Decrement a buffer_head's reference count. If all buffers against a page
1139 * have zero reference count, are clean and unlocked, and if the page is clean
1140 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1141 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1142 * a page but it ends up not being freed, and buffers may later be reattached).
1144 void __brelse(struct buffer_head * buf)
1146 if (atomic_read(&buf->b_count)) {
1147 put_bh(buf);
1148 return;
1150 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1151 buffer_error(); /* For the stack backtrace */
1155 * bforget() is like brelse(), except it discards any
1156 * potentially dirty data.
1158 void __bforget(struct buffer_head *bh)
1160 clear_buffer_dirty(bh);
1161 if (!list_empty(&bh->b_assoc_buffers)) {
1162 struct address_space *buffer_mapping = bh->b_page->mapping;
1164 spin_lock(&buffer_mapping->private_lock);
1165 list_del_init(&bh->b_assoc_buffers);
1166 spin_unlock(&buffer_mapping->private_lock);
1168 __brelse(bh);
1171 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1173 lock_buffer(bh);
1174 if (buffer_uptodate(bh)) {
1175 unlock_buffer(bh);
1176 return bh;
1177 } else {
1178 if (buffer_dirty(bh))
1179 buffer_error();
1180 get_bh(bh);
1181 bh->b_end_io = end_buffer_io_sync;
1182 submit_bh(READ, bh);
1183 wait_on_buffer(bh);
1184 if (buffer_uptodate(bh))
1185 return bh;
1187 brelse(bh);
1188 return NULL;
1192 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1193 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1194 * refcount elevated by one when they're in an LRU. A buffer can only appear
1195 * once in a particular CPU's LRU. A single buffer can be present in multiple
1196 * CPUs' LRUs at the same time.
1198 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1199 * sb_find_get_block().
1201 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1202 * a local interrupt disable for that.
1205 #define BH_LRU_SIZE 8
1207 struct bh_lru {
1208 struct buffer_head *bhs[BH_LRU_SIZE];
1211 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{0}};
1213 #ifdef CONFIG_SMP
1214 #define bh_lru_lock() local_irq_disable()
1215 #define bh_lru_unlock() local_irq_enable()
1216 #else
1217 #define bh_lru_lock() preempt_disable()
1218 #define bh_lru_unlock() preempt_enable()
1219 #endif
1221 static inline void check_irqs_on(void)
1223 #ifdef irqs_disabled
1224 BUG_ON(irqs_disabled());
1225 #endif
1229 * The LRU management algorithm is dopey-but-simple. Sorry.
1231 static void bh_lru_install(struct buffer_head *bh)
1233 struct buffer_head *evictee = NULL;
1234 struct bh_lru *lru;
1236 check_irqs_on();
1237 bh_lru_lock();
1238 lru = &per_cpu(bh_lrus, smp_processor_id());
1239 if (lru->bhs[0] != bh) {
1240 struct buffer_head *bhs[BH_LRU_SIZE];
1241 int in;
1242 int out = 0;
1244 get_bh(bh);
1245 bhs[out++] = bh;
1246 for (in = 0; in < BH_LRU_SIZE; in++) {
1247 struct buffer_head *bh2 = lru->bhs[in];
1249 if (bh2 == bh) {
1250 __brelse(bh2);
1251 } else {
1252 if (out >= BH_LRU_SIZE) {
1253 BUG_ON(evictee != NULL);
1254 evictee = bh2;
1255 } else {
1256 bhs[out++] = bh2;
1260 while (out < BH_LRU_SIZE)
1261 bhs[out++] = NULL;
1262 memcpy(lru->bhs, bhs, sizeof(bhs));
1264 bh_lru_unlock();
1266 if (evictee)
1267 __brelse(evictee);
1271 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1273 static inline struct buffer_head *
1274 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1276 struct buffer_head *ret = NULL;
1277 struct bh_lru *lru;
1278 int i;
1280 check_irqs_on();
1281 bh_lru_lock();
1282 lru = &per_cpu(bh_lrus, smp_processor_id());
1283 for (i = 0; i < BH_LRU_SIZE; i++) {
1284 struct buffer_head *bh = lru->bhs[i];
1286 if (bh && bh->b_bdev == bdev &&
1287 bh->b_blocknr == block && bh->b_size == size) {
1288 if (i) {
1289 while (i) {
1290 lru->bhs[i] = lru->bhs[i - 1];
1291 i--;
1293 lru->bhs[0] = bh;
1295 get_bh(bh);
1296 ret = bh;
1297 break;
1300 bh_lru_unlock();
1301 return ret;
1305 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1306 * it in the LRU and mark it as accessed. If it is not present then return
1307 * NULL
1309 struct buffer_head *
1310 __find_get_block(struct block_device *bdev, sector_t block, int size)
1312 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1314 if (bh == NULL) {
1315 bh = __find_get_block_slow(bdev, block, size);
1316 if (bh)
1317 bh_lru_install(bh);
1319 if (bh)
1320 touch_buffer(bh);
1321 return bh;
1323 EXPORT_SYMBOL(__find_get_block);
1326 * __getblk will locate (and, if necessary, create) the buffer_head
1327 * which corresponds to the passed block_device, block and size. The
1328 * returned buffer has its reference count incremented.
1330 * __getblk() cannot fail - it just keeps trying. If you pass it an
1331 * illegal block number, __getblk() will happily return a buffer_head
1332 * which represents the non-existent block. Very weird.
1334 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1335 * attempt is failing. FIXME, perhaps?
1337 struct buffer_head *
1338 __getblk(struct block_device *bdev, sector_t block, int size)
1340 struct buffer_head *bh = __find_get_block(bdev, block, size);
1342 if (bh == NULL)
1343 bh = __getblk_slow(bdev, block, size);
1344 return bh;
1346 EXPORT_SYMBOL(__getblk);
1349 * __bread() - reads a specified block and returns the bh
1350 * @block: number of block
1351 * @size: size (in bytes) to read
1353 * Reads a specified block, and returns buffer head that contains it.
1354 * It returns NULL if the block was unreadable.
1356 struct buffer_head *
1357 __bread(struct block_device *bdev, sector_t block, int size)
1359 struct buffer_head *bh = __getblk(bdev, block, size);
1361 if (!buffer_uptodate(bh))
1362 bh = __bread_slow(bh);
1363 return bh;
1365 EXPORT_SYMBOL(__bread);
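/*
 * Illustrative sketch (not part of this file): when the whole block is
 * about to be overwritten there is no need to read it first - __getblk()
 * plus set_buffer_uptodate() is enough.  "example_getblk_zero" is a
 * hypothetical helper; the caller must brelse() the result.
 */
static struct buffer_head *
example_getblk_zero(struct block_device *bdev, sector_t block, int size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);	/* new contents, no read done */
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	unlock_buffer(bh);
	return bh;
}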
1368 * invalidate_bh_lrus() is called rarely - at unmount. Because it is only for
1369 * unmount it only needs to ensure that all buffers from the target device are
1370 * invalidated on return and it doesn't need to worry about new buffers from
1371 * that device being added - the unmount code has to prevent that.
1373 static void invalidate_bh_lru(void *arg)
1375 const int cpu = get_cpu();
1376 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
1377 int i;
1379 for (i = 0; i < BH_LRU_SIZE; i++) {
1380 brelse(b->bhs[i]);
1381 b->bhs[i] = NULL;
1383 put_cpu();
1386 static void invalidate_bh_lrus(void)
1388 preempt_disable();
1389 invalidate_bh_lru(NULL);
1390 smp_call_function(invalidate_bh_lru, NULL, 1, 1);
1391 preempt_enable();
1394 void set_bh_page(struct buffer_head *bh,
1395 struct page *page, unsigned long offset)
1397 bh->b_page = page;
1398 if (offset >= PAGE_SIZE)
1399 BUG();
1400 if (PageHighMem(page))
1402 * This catches illegal uses and preserves the offset:
1404 bh->b_data = (char *)(0 + offset);
1405 else
1406 bh->b_data = page_address(page) + offset;
1408 EXPORT_SYMBOL(set_bh_page);
1411 * Called when truncating a buffer on a page completely.
1413 static inline void discard_buffer(struct buffer_head * bh)
1415 lock_buffer(bh);
1416 clear_buffer_dirty(bh);
1417 bh->b_bdev = NULL;
1418 clear_buffer_mapped(bh);
1419 clear_buffer_req(bh);
1420 clear_buffer_new(bh);
1421 unlock_buffer(bh);
1425 * try_to_release_page() - release old fs-specific metadata on a page
1427 * @page: the page which the kernel is trying to free
1428 * @gfp_mask: memory allocation flags (and I/O mode)
1430 * The address_space is to try to release any data against the page
1431 * (presumably at page->private). If the release was successful, return `1'.
1432 * Otherwise return zero.
1434 * The @gfp_mask argument specifies whether I/O may be performed to release
1435 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1437 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1439 int try_to_release_page(struct page *page, int gfp_mask)
1441 struct address_space * const mapping = page->mapping;
1443 if (!PageLocked(page))
1444 BUG();
1445 if (PageWriteback(page))
1446 return 0;
1448 if (mapping && mapping->a_ops->releasepage)
1449 return mapping->a_ops->releasepage(page, gfp_mask);
1450 return try_to_free_buffers(page);
1454 * block_invalidatepage - invalidate part or all of a buffer-backed page
1456 * @page: the page which is affected
1457 * @offset: the index of the truncation point
1459 * block_invalidatepage() is called when all or part of the page has become
1460 * invalidated by a truncate operation.
1462 * block_invalidatepage() does not have to release all buffers, but it must
1463 * ensure that no dirty buffer is left outside @offset and that no I/O
1464 * is underway against any of the blocks which are outside the truncation
1465 * point. Because the caller is about to free (and possibly reuse) those
1466 * blocks on-disk.
1468 int block_invalidatepage(struct page *page, unsigned long offset)
1470 struct buffer_head *head, *bh, *next;
1471 unsigned int curr_off = 0;
1472 int ret = 1;
1474 BUG_ON(!PageLocked(page));
1475 if (!page_has_buffers(page))
1476 goto out;
1478 head = page_buffers(page);
1479 bh = head;
1480 do {
1481 unsigned int next_off = curr_off + bh->b_size;
1482 next = bh->b_this_page;
1485 * is this block fully invalidated?
1487 if (offset <= curr_off)
1488 discard_buffer(bh);
1489 curr_off = next_off;
1490 bh = next;
1491 } while (bh != head);
1494 * We release buffers only if the entire page is being invalidated.
1495 * The get_block cached value has been unconditionally invalidated,
1496 * so real IO is not possible anymore.
1498 if (offset == 0)
1499 ret = try_to_release_page(page, 0);
1500 out:
1501 return ret;
1503 EXPORT_SYMBOL(block_invalidatepage);
1506 * We attach and possibly dirty the buffers atomically wrt
1507 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1508 * is already excluded via the page lock.
1510 void create_empty_buffers(struct page *page,
1511 unsigned long blocksize, unsigned long b_state)
1513 struct buffer_head *bh, *head, *tail;
1515 head = create_buffers(page, blocksize, 1);
1516 bh = head;
1517 do {
1518 bh->b_state |= b_state;
1519 tail = bh;
1520 bh = bh->b_this_page;
1521 } while (bh);
1522 tail->b_this_page = head;
1524 spin_lock(&page->mapping->private_lock);
1525 if (PageUptodate(page) || PageDirty(page)) {
1526 bh = head;
1527 do {
1528 if (PageDirty(page))
1529 set_buffer_dirty(bh);
1530 if (PageUptodate(page))
1531 set_buffer_uptodate(bh);
1532 bh = bh->b_this_page;
1533 } while (bh != head);
1535 __set_page_buffers(page, head);
1536 spin_unlock(&page->mapping->private_lock);
1538 EXPORT_SYMBOL(create_empty_buffers);
1541 * We are taking a block for data and we don't want any output from any
1542 * buffer-cache aliases starting from return from that function and
1543 * until the moment when something will explicitly mark the buffer
1544 * dirty (hopefully that will not happen until we free that block ;-)
1545 * We don't even need to mark it not-uptodate - nobody can expect
1546 * anything from a newly allocated buffer anyway. We used to use
1547 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1548 * don't want to mark the alias unmapped, for example - it would confuse
1549 * anyone who might pick it with bread() afterwards...
1551 * Also.. Note that bforget() doesn't lock the buffer. So there can
1552 * be writeout I/O going on against recently-freed buffers. We don't
1553 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1554 * only if we really need to. That happens here.
1556 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1558 struct buffer_head *old_bh;
1560 old_bh = __find_get_block_slow(bdev, block, 0);
1561 if (old_bh) {
1562 #if 0 /* This happens. Later. */
1563 if (buffer_dirty(old_bh))
1564 buffer_error();
1565 #endif
1566 clear_buffer_dirty(old_bh);
1567 wait_on_buffer(old_bh);
1568 clear_buffer_req(old_bh);
1569 __brelse(old_bh);
1572 EXPORT_SYMBOL(unmap_underlying_metadata);
1575 * NOTE! All mapped/uptodate combinations are valid:
1577 *   Mapped   Uptodate   Meaning
1579 *   No       No         "unknown" - must do get_block()
1580 *   No       Yes        "hole" - zero-filled
1581 *   Yes      No         "allocated" - allocated on disk, not read in
1582 *   Yes      Yes        "valid" - allocated and up-to-date in memory.
1584 * "Dirty" is valid only with the last case (mapped+uptodate).
1588 * While block_write_full_page is writing back the dirty buffers under
1589 * the page lock, whoever dirtied the buffers may decide to clean them
1590 * again at any time. We handle that by only looking at the buffer
1591 * state inside lock_buffer().
1593 * If block_write_full_page() is called for regular writeback
1594 * (called_for_sync() is false) then it will return -EAGAIN for a locked
1595 * buffer. This can only happen if someone has written the buffer directly,
1596 * with submit_bh(). At the address_space level PageWriteback prevents this
1597 * contention from occurring.
1599 static int __block_write_full_page(struct inode *inode,
1600 struct page *page, get_block_t *get_block)
1602 int err;
1603 int ret = 0;
1604 unsigned long block;
1605 unsigned long last_block;
1606 struct buffer_head *bh, *head;
1607 int nr_underway = 0;
1609 BUG_ON(!PageLocked(page));
1611 last_block = (inode->i_size - 1) >> inode->i_blkbits;
1613 if (!page_has_buffers(page)) {
1614 if (S_ISBLK(inode->i_mode))
1615 buffer_error();
1616 if (!PageUptodate(page))
1617 buffer_error();
1618 create_empty_buffers(page, 1 << inode->i_blkbits,
1619 (1 << BH_Dirty)|(1 << BH_Uptodate));
1623 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1624 * here, and the (potentially unmapped) buffers may become dirty at
1625 * any time. If a buffer becomes dirty here after we've inspected it
1626 * then we just miss that fact, and the page stays dirty.
1628 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1629 * handle that here by just cleaning them.
1632 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1633 head = page_buffers(page);
1634 bh = head;
1637 * Get all the dirty buffers mapped to disk addresses and
1638 * handle any aliases from the underlying blockdev's mapping.
1640 do {
1641 if (block > last_block) {
1643 * mapped buffers outside i_size will occur, because
1644 * this page can be outside i_size when there is a
1645 * truncate in progress.
1647 * if (buffer_mapped(bh))
1648 * buffer_error();
1651 * The buffer was zeroed by block_write_full_page()
1653 clear_buffer_dirty(bh);
1654 set_buffer_uptodate(bh);
1655 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1656 if (buffer_new(bh))
1657 buffer_error();
1658 err = get_block(inode, block, bh, 1);
1659 if (err)
1660 goto recover;
1661 if (buffer_new(bh)) {
1662 /* blockdev mappings never come here */
1663 clear_buffer_new(bh);
1664 unmap_underlying_metadata(bh->b_bdev,
1665 bh->b_blocknr);
1668 bh = bh->b_this_page;
1669 block++;
1670 } while (bh != head);
1672 do {
1673 get_bh(bh);
1674 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1675 if (called_for_sync()) {
1676 lock_buffer(bh);
1677 } else {
1678 if (test_set_buffer_locked(bh)) {
1679 ret = -EAGAIN;
1680 continue;
1683 if (test_clear_buffer_dirty(bh)) {
1684 if (!buffer_uptodate(bh))
1685 buffer_error();
1686 mark_buffer_async_write(bh);
1687 } else {
1688 unlock_buffer(bh);
1691 } while ((bh = bh->b_this_page) != head);
1693 BUG_ON(PageWriteback(page));
1694 SetPageWriteback(page); /* Keeps try_to_free_buffers() away */
1695 unlock_page(page);
1698 * The page may come unlocked any time after the *first* submit_bh()
1699 * call. Be careful with its buffers.
1701 do {
1702 struct buffer_head *next = bh->b_this_page;
1703 if (buffer_async_write(bh)) {
1704 submit_bh(WRITE, bh);
1705 nr_underway++;
1707 put_bh(bh);
1708 bh = next;
1709 } while (bh != head);
1711 err = 0;
1712 done:
1713 if (nr_underway == 0) {
1715 * The page was marked dirty, but the buffers were
1716 * clean. Someone wrote them back by hand with
1717 * ll_rw_block/submit_bh. A rare case.
1719 int uptodate = 1;
1720 do {
1721 if (!buffer_uptodate(bh)) {
1722 uptodate = 0;
1723 break;
1725 bh = bh->b_this_page;
1726 } while (bh != head);
1727 if (uptodate)
1728 SetPageUptodate(page);
1729 end_page_writeback(page);
1731 if (err == 0)
1732 return ret;
1733 return err;
1735 recover:
1737 * ENOSPC, or some other error. We may already have added some
1738 * blocks to the file, so we need to write these out to avoid
1739 * exposing stale data.
1740 * The page is currently locked and not marked for writeback
1742 ClearPageUptodate(page);
1743 bh = head;
1744 /* Recovery: lock and submit the mapped buffers */
1745 do {
1746 get_bh(bh);
1747 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1748 lock_buffer(bh);
1749 mark_buffer_async_write(bh);
1750 } else {
1752 * The buffer may have been set dirty during
1753 * attachment to a dirty page.
1755 clear_buffer_dirty(bh);
1757 } while ((bh = bh->b_this_page) != head);
1758 SetPageError(page);
1759 BUG_ON(PageWriteback(page));
1760 SetPageWriteback(page);
1761 unlock_page(page);
1762 do {
1763 struct buffer_head *next = bh->b_this_page;
1764 if (buffer_async_write(bh)) {
1765 clear_buffer_dirty(bh);
1766 submit_bh(WRITE, bh);
1767 nr_underway++;
1769 put_bh(bh);
1770 bh = next;
1771 } while (bh != head);
1772 goto done;
1775 static int __block_prepare_write(struct inode *inode, struct page *page,
1776 unsigned from, unsigned to, get_block_t *get_block)
1778 unsigned block_start, block_end;
1779 sector_t block;
1780 int err = 0;
1781 unsigned blocksize, bbits;
1782 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1784 BUG_ON(!PageLocked(page));
1785 BUG_ON(from > PAGE_CACHE_SIZE);
1786 BUG_ON(to > PAGE_CACHE_SIZE);
1787 BUG_ON(from > to);
1789 blocksize = 1 << inode->i_blkbits;
1790 if (!page_has_buffers(page))
1791 create_empty_buffers(page, blocksize, 0);
1792 head = page_buffers(page);
1794 bbits = inode->i_blkbits;
1795 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1797 for(bh = head, block_start = 0; bh != head || !block_start;
1798 block++, block_start=block_end, bh = bh->b_this_page) {
1799 block_end = block_start + blocksize;
1800 if (block_end <= from || block_start >= to) {
1801 if (PageUptodate(page)) {
1802 if (!buffer_uptodate(bh))
1803 set_buffer_uptodate(bh);
1805 continue;
1807 if (buffer_new(bh))
1808 clear_buffer_new(bh);
1809 if (!buffer_mapped(bh)) {
1810 err = get_block(inode, block, bh, 1);
1811 if (err)
1812 goto out;
1813 if (buffer_new(bh)) {
1814 clear_buffer_new(bh);
1815 unmap_underlying_metadata(bh->b_bdev,
1816 bh->b_blocknr);
1817 if (PageUptodate(page)) {
1818 if (!buffer_mapped(bh))
1819 buffer_error();
1820 set_buffer_uptodate(bh);
1821 continue;
1823 if (block_end > to || block_start < from) {
1824 void *kaddr;
1826 kaddr = kmap_atomic(page, KM_USER0);
1827 if (block_end > to)
1828 memset(kaddr+to, 0,
1829 block_end-to);
1830 if (block_start < from)
1831 memset(kaddr+block_start,
1832 0, from-block_start);
1833 flush_dcache_page(page);
1834 kunmap_atomic(kaddr, KM_USER0);
1836 continue;
1839 if (PageUptodate(page)) {
1840 if (!buffer_uptodate(bh))
1841 set_buffer_uptodate(bh);
1842 continue;
1844 if (!buffer_uptodate(bh) &&
1845 (block_start < from || block_end > to)) {
1846 ll_rw_block(READ, 1, &bh);
1847 *wait_bh++=bh;
1851 * If we issued read requests - let them complete.
1853 while(wait_bh > wait) {
1854 wait_on_buffer(*--wait_bh);
1855 if (!buffer_uptodate(*wait_bh))
1856 return -EIO;
1858 return 0;
1859 out:
1861 * Zero out any newly allocated blocks to avoid exposing stale
1862 * data. If BH_New is set, we know that the block was newly
1863 * allocated in the above loop.
1865 bh = head;
1866 block_start = 0;
1867 do {
1868 block_end = block_start+blocksize;
1869 if (block_end <= from)
1870 goto next_bh;
1871 if (block_start >= to)
1872 break;
1873 if (buffer_new(bh)) {
1874 void *kaddr;
1876 clear_buffer_new(bh);
1877 if (buffer_uptodate(bh))
1878 buffer_error();
1879 kaddr = kmap_atomic(page, KM_USER0);
1880 memset(kaddr+block_start, 0, bh->b_size);
1881 kunmap_atomic(kaddr, KM_USER0);
1882 set_buffer_uptodate(bh);
1883 mark_buffer_dirty(bh);
1885 next_bh:
1886 block_start = block_end;
1887 bh = bh->b_this_page;
1888 } while (bh != head);
1889 return err;
1892 static int __block_commit_write(struct inode *inode, struct page *page,
1893 unsigned from, unsigned to)
1895 unsigned block_start, block_end;
1896 int partial = 0;
1897 unsigned blocksize;
1898 struct buffer_head *bh, *head;
1900 blocksize = 1 << inode->i_blkbits;
1902 for(bh = head = page_buffers(page), block_start = 0;
1903 bh != head || !block_start;
1904 block_start=block_end, bh = bh->b_this_page) {
1905 block_end = block_start + blocksize;
1906 if (block_end <= from || block_start >= to) {
1907 if (!buffer_uptodate(bh))
1908 partial = 1;
1909 } else {
1910 set_buffer_uptodate(bh);
1911 mark_buffer_dirty(bh);
1916 * If this is a partial write which happened to make all buffers
1917 * uptodate then we can optimize away a bogus readpage() for
1918 * the next read(). Here we 'discover' whether the page went
1919 * uptodate as a result of this (potentially partial) write.
1921 if (!partial)
1922 SetPageUptodate(page);
1923 return 0;
1927 * Generic "read page" function for block devices that have the normal
1928 * get_block functionality. This is most of the block device filesystems.
1929 * Reads the page asynchronously --- the unlock_buffer() and
1930 * set/clear_buffer_uptodate() functions propagate buffer state into the
1931 * page struct once IO has completed.
1933 int block_read_full_page(struct page *page, get_block_t *get_block)
1935 struct inode *inode = page->mapping->host;
1936 sector_t iblock, lblock;
1937 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1938 unsigned int blocksize, blocks;
1939 int nr, i;
1941 if (!PageLocked(page))
1942 PAGE_BUG(page);
1943 if (PageUptodate(page))
1944 buffer_error();
1945 blocksize = 1 << inode->i_blkbits;
1946 if (!page_has_buffers(page))
1947 create_empty_buffers(page, blocksize, 0);
1948 head = page_buffers(page);
1950 blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1951 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1952 lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1953 bh = head;
1954 nr = 0;
1955 i = 0;
1957 do {
1958 if (buffer_uptodate(bh))
1959 continue;
1961 if (!buffer_mapped(bh)) {
1962 if (iblock < lblock) {
1963 if (get_block(inode, iblock, bh, 0))
1964 SetPageError(page);
1966 if (!buffer_mapped(bh)) {
1967 void *kaddr = kmap_atomic(page, KM_USER0);
1968 memset(kaddr + i * blocksize, 0, blocksize);
1969 flush_dcache_page(page);
1970 kunmap_atomic(kaddr, KM_USER0);
1971 set_buffer_uptodate(bh);
1972 continue;
1975 * get_block() might have updated the buffer
1976 * synchronously
1978 if (buffer_uptodate(bh))
1979 continue;
1981 arr[nr++] = bh;
1982 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1984 if (!nr) {
1986 * All buffers are uptodate - we can set the page uptodate
1987 * as well. But not if get_block() returned an error.
1989 if (!PageError(page))
1990 SetPageUptodate(page);
1991 unlock_page(page);
1992 return 0;
1995 /* Stage two: lock the buffers */
1996 for (i = 0; i < nr; i++) {
1997 bh = arr[i];
1998 lock_buffer(bh);
1999 mark_buffer_async_read(bh);
2003 * Stage 3: start the IO. Check for uptodateness
2004 * inside the buffer lock in case another process reading
2005 * the underlying blockdev brought it uptodate (the sct fix).
2007 for (i = 0; i < nr; i++) {
2008 bh = arr[i];
2009 if (buffer_uptodate(bh))
2010 end_buffer_async_read(bh, 1);
2011 else
2012 submit_bh(READ, bh);
2014 return 0;
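/*
 * Illustrative sketch (not part of this file): block_read_full_page() is
 * normally wired into a filesystem's readpage address_space operation with
 * that filesystem's own get_block callback.  "examplefs_get_block" and
 * "examplefs_readpage" are hypothetical names.
 */
int examplefs_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create);

static int examplefs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, examplefs_get_block);
}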
2017 /* utility function for filesystems that need to do work on expanding
2018 * truncates. Uses prepare/commit_write to allow the filesystem to
2019 * deal with the hole.
2021 int generic_cont_expand(struct inode *inode, loff_t size)
2023 struct address_space *mapping = inode->i_mapping;
2024 struct page *page;
2025 unsigned long index, offset, limit;
2026 int err;
2028 err = -EFBIG;
2029 limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2030 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2031 send_sig(SIGXFSZ, current, 0);
2032 goto out;
2034 if (size > inode->i_sb->s_maxbytes)
2035 goto out;
2037 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2039 /* ugh. in prepare/commit_write, if from==to==start of block, we
2040 ** skip the prepare. make sure we never send an offset for the start
2041 ** of a block
2043 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2044 offset++;
2046 index = size >> PAGE_CACHE_SHIFT;
2047 err = -ENOMEM;
2048 page = grab_cache_page(mapping, index);
2049 if (!page)
2050 goto out;
2051 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2052 if (!err) {
2053 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2055 unlock_page(page);
2056 page_cache_release(page);
2057 if (err > 0)
2058 err = 0;
2059 out:
2060 return err;
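/*
 * Hypothetical usage sketch (not in the original source): a filesystem
 * that cannot represent holes would call generic_cont_expand() from its
 * size-increasing attribute/truncate path, roughly as below.  The
 * "examplefs_" name is invented for illustration.
 */
#if 0
static int examplefs_grow(struct inode *inode, loff_t new_size)
{
	/* only expansion is handled here; shrinking takes another path */
	if (new_size <= inode->i_size)
		return 0;
	return generic_cont_expand(inode, new_size);
}
#endif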
2064 * For moronic filesystems that do not allow holes in files.
2065 * We may have to extend the file.
2068 int cont_prepare_write(struct page *page, unsigned offset,
2069 unsigned to, get_block_t *get_block, loff_t *bytes)
2071 struct address_space *mapping = page->mapping;
2072 struct inode *inode = mapping->host;
2073 struct page *new_page;
2074 unsigned long pgpos;
2075 long status;
2076 unsigned zerofrom;
2077 unsigned blocksize = 1 << inode->i_blkbits;
2078 void *kaddr;
2080 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2081 status = -ENOMEM;
2082 new_page = grab_cache_page(mapping, pgpos);
2083 if (!new_page)
2084 goto out;
2085 /* we might sleep */
2086 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2087 unlock_page(new_page);
2088 page_cache_release(new_page);
2089 continue;
2091 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2092 if (zerofrom & (blocksize-1)) {
2093 *bytes |= (blocksize-1);
2094 (*bytes)++;
2096 status = __block_prepare_write(inode, new_page, zerofrom,
2097 PAGE_CACHE_SIZE, get_block);
2098 if (status)
2099 goto out_unmap;
2100 kaddr = kmap_atomic(new_page, KM_USER0);
2101 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2102 flush_dcache_page(new_page);
2103 kunmap_atomic(kaddr, KM_USER0);
2104 __block_commit_write(inode, new_page,
2105 zerofrom, PAGE_CACHE_SIZE);
2106 unlock_page(new_page);
2107 page_cache_release(new_page);
2110 if (page->index < pgpos) {
2111 /* completely inside the area */
2112 zerofrom = offset;
2113 } else {
2114 /* page covers the boundary, find the boundary offset */
2115 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2117 /* if we are going to expand the file, the last block will be filled */
2118 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2119 *bytes |= (blocksize-1);
2120 (*bytes)++;
2123 /* starting below the boundary? Nothing to zero out */
2124 if (offset <= zerofrom)
2125 zerofrom = offset;
2127 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2128 if (status)
2129 goto out1;
2130 if (zerofrom < offset) {
2131 kaddr = kmap_atomic(page, KM_USER0);
2132 memset(kaddr+zerofrom, 0, offset-zerofrom);
2133 flush_dcache_page(page);
2134 kunmap_atomic(kaddr, KM_USER0);
2135 __block_commit_write(inode, page, zerofrom, offset);
2137 return 0;
2138 out1:
2139 ClearPageUptodate(page);
2140 return status;
2142 out_unmap:
2143 ClearPageUptodate(new_page);
2144 unlock_page(new_page);
2145 page_cache_release(new_page);
2146 out:
2147 return status;
2150 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2151 get_block_t *get_block)
2153 struct inode *inode = page->mapping->host;
2154 int err = __block_prepare_write(inode, page, from, to, get_block);
2155 if (err)
2156 ClearPageUptodate(page);
2157 return err;
2160 int block_commit_write(struct page *page, unsigned from, unsigned to)
2162 struct inode *inode = page->mapping->host;
2163 __block_commit_write(inode,page,from,to);
2164 return 0;
2167 int generic_commit_write(struct file *file, struct page *page,
2168 unsigned from, unsigned to)
2170 struct inode *inode = page->mapping->host;
2171 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2172 __block_commit_write(inode,page,from,to);
2173 if (pos > inode->i_size) {
2174 inode->i_size = pos;
2175 mark_inode_dirty(inode);
2177 return 0;
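/*
 * Added sketch (hypothetical, not from this file): the prepare/commit
 * helpers above are normally exposed through address_space_operations
 * like this.  "examplefs_get_block" and "examplefs_readpage" stand for
 * the filesystem's own get_block_t and block_read_full_page wrapper;
 * ->writepage, which would wrap block_write_full_page() the same way,
 * is left out of the sketch.
 */
#if 0
static int examplefs_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create);
static int examplefs_readpage(struct file *file, struct page *page);

static int examplefs_prepare_write(struct file *file, struct page *page,
			unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, examplefs_get_block);
}

static struct address_space_operations examplefs_aops = {
	.readpage	= examplefs_readpage,
	.sync_page	= block_sync_page,
	.prepare_write	= examplefs_prepare_write,
	.commit_write	= generic_commit_write,
};
#endif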
2180 int block_truncate_page(struct address_space *mapping,
2181 loff_t from, get_block_t *get_block)
2183 unsigned long index = from >> PAGE_CACHE_SHIFT;
2184 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2185 unsigned blocksize, iblock, length, pos;
2186 struct inode *inode = mapping->host;
2187 struct page *page;
2188 struct buffer_head *bh;
2189 void *kaddr;
2190 int err;
2192 blocksize = 1 << inode->i_blkbits;
2193 length = offset & (blocksize - 1);
2195 /* Block boundary? Nothing to do */
2196 if (!length)
2197 return 0;
2199 length = blocksize - length;
2200 iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2202 page = grab_cache_page(mapping, index);
2203 err = -ENOMEM;
2204 if (!page)
2205 goto out;
2207 if (!page_has_buffers(page))
2208 create_empty_buffers(page, blocksize, 0);
2210 /* Find the buffer that contains "offset" */
2211 bh = page_buffers(page);
2212 pos = blocksize;
2213 while (offset >= pos) {
2214 bh = bh->b_this_page;
2215 iblock++;
2216 pos += blocksize;
2219 err = 0;
2220 if (!buffer_mapped(bh)) {
2221 err = get_block(inode, iblock, bh, 0);
2222 if (err)
2223 goto unlock;
2224 /* unmapped? It's a hole - nothing to do */
2225 if (!buffer_mapped(bh))
2226 goto unlock;
2229 /* Ok, it's mapped. Make sure it's up-to-date */
2230 if (PageUptodate(page))
2231 set_buffer_uptodate(bh);
2233 if (!buffer_uptodate(bh)) {
2234 err = -EIO;
2235 ll_rw_block(READ, 1, &bh);
2236 wait_on_buffer(bh);
2237 /* Uhhuh. Read error. Complain and punt. */
2238 if (!buffer_uptodate(bh))
2239 goto unlock;
2242 kaddr = kmap_atomic(page, KM_USER0);
2243 memset(kaddr + offset, 0, length);
2244 flush_dcache_page(page);
2245 kunmap_atomic(kaddr, KM_USER0);
2247 mark_buffer_dirty(bh);
2248 err = 0;
2250 unlock:
2251 unlock_page(page);
2252 page_cache_release(page);
2253 out:
2254 return err;
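/*
 * Hypothetical illustration (not in the original file): a filesystem's
 * ->truncate would typically use block_truncate_page() to zero the tail
 * of the now-partial last block before releasing the on-disk blocks
 * beyond the new i_size.  The "examplefs_" helpers are made-up names.
 */
#if 0
static int examplefs_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create);
static void examplefs_free_blocks(struct inode *inode);

static void examplefs_truncate(struct inode *inode)
{
	block_truncate_page(inode->i_mapping, inode->i_size,
				examplefs_get_block);
	examplefs_free_blocks(inode);	/* fs-specific block freeing */
}
#endif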
2258 * The generic ->writepage function for buffer-backed address_spaces
2260 int block_write_full_page(struct page *page, get_block_t *get_block)
2262 struct inode * const inode = page->mapping->host;
2263 const unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2264 unsigned offset;
2265 void *kaddr;
2267 /* Is the page fully inside i_size? */
2268 if (page->index < end_index)
2269 return __block_write_full_page(inode, page, get_block);
2271 /* Is the page fully outside i_size? (truncate in progress) */
2272 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2273 if (page->index >= end_index+1 || !offset) {
2274 unlock_page(page);
2275 return -EIO;
2279 * The page straddles i_size. It must be zeroed out on each and every
2280 * writepage invocation because it may be mmapped. "A file is mapped
2281 * in multiples of the page size. For a file that is not a multiple of
2282 * the page size, the remaining memory is zeroed when mapped, and
2283 * writes to that region are not written out to the file."
2285 kaddr = kmap_atomic(page, KM_USER0);
2286 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2287 flush_dcache_page(page);
2288 kunmap_atomic(kaddr, KM_USER0);
2289 return __block_write_full_page(inode, page, get_block);
2292 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2293 get_block_t *get_block)
2295 struct buffer_head tmp;
2296 struct inode *inode = mapping->host;
2297 tmp.b_state = 0;
2298 tmp.b_blocknr = 0;
2299 get_block(inode, block, &tmp, 0);
2300 return tmp.b_blocknr;
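/*
 * Added sketch, purely illustrative: filesystems expose this through a
 * one-line ->bmap wrapper in their address_space_operations, with their
 * own (here hypothetical) get_block.
 */
#if 0
static int examplefs_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create);

static sector_t examplefs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, examplefs_get_block);
}
#endif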
2303 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2305 struct buffer_head *bh = bio->bi_private;
2307 if (bio->bi_size)
2308 return 1;
2310 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2311 bio_put(bio);
2312 return 0;
2315 int submit_bh(int rw, struct buffer_head * bh)
2317 struct bio *bio;
2319 BUG_ON(!buffer_locked(bh));
2320 BUG_ON(!buffer_mapped(bh));
2321 BUG_ON(!bh->b_end_io);
2323 if ((rw == READ || rw == READA) && buffer_uptodate(bh))
2324 buffer_error();
2325 if (rw == WRITE && !buffer_uptodate(bh))
2326 buffer_error();
2327 if (rw == READ && buffer_dirty(bh))
2328 buffer_error();
2330 set_buffer_req(bh);
2333 * from here on down, it's all bio -- do the initial mapping,
2334 * submit_bio -> generic_make_request may further map this bio around
2336 bio = bio_alloc(GFP_NOIO, 1);
2338 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2339 bio->bi_bdev = bh->b_bdev;
2340 bio->bi_io_vec[0].bv_page = bh->b_page;
2341 bio->bi_io_vec[0].bv_len = bh->b_size;
2342 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2344 bio->bi_vcnt = 1;
2345 bio->bi_idx = 0;
2346 bio->bi_size = bh->b_size;
2348 bio->bi_end_io = end_bio_bh_io_sync;
2349 bio->bi_private = bh;
2351 return submit_bio(rw, bio);
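/*
 * Added commentary: bi_sector above is in 512-byte units, so a 4096-byte
 * buffer at block number N starts at sector N * 8, and bv_offset (via
 * bh_offset()) locates the buffer's data within its page.  The
 * single-segment bio built here can still be merged with neighbouring
 * requests lower down in the block layer.
 */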
2355 * ll_rw_block: low-level access to block devices (DEPRECATED)
2356 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2357 * @nr: number of &struct buffer_heads in the array
2358 * @bhs: array of pointers to &struct buffer_head
2360 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2361 * and requests an I/O operation on them, either a %READ or a %WRITE.
2362 * The third %READA option is described in the documentation for
2363 * generic_make_request() which ll_rw_block() calls.
2365 * This function drops any buffer that it cannot get a lock on (with the
2366 * BH_Lock state bit), any buffer that appears to be clean when doing a
2367 * write request, and any buffer that appears to be up-to-date when doing
2368 * a read request.  Further it marks as clean buffers that are processed for
2369 * writing (the buffer cache won't assume that they are actually clean until
2370 * the buffer gets unlocked).
2372 * ll_rw_block sets b_end_io to a simple completion handler that marks
2373 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2374 * any waiters.
2376 * All of the buffers must be for the same device, and must also be a
2377 * multiple of the current approved size for the device.
2379 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2381 int i;
2383 for (i = 0; i < nr; i++) {
2384 struct buffer_head *bh = bhs[i];
2386 if (test_set_buffer_locked(bh))
2387 continue;
2389 get_bh(bh);
2390 bh->b_end_io = end_buffer_io_sync;
2391 if (rw == WRITE) {
2392 if (test_clear_buffer_dirty(bh)) {
2393 submit_bh(WRITE, bh);
2394 continue;
2396 } else {
2397 if (!buffer_uptodate(bh)) {
2398 submit_bh(rw, bh);
2399 continue;
2402 unlock_buffer(bh);
2403 put_bh(bh);
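/*
 * Hypothetical usage sketch (not in the original source): reading one
 * metadata block while kicking off readahead of a second.  sb_getblk()
 * only instantiates the buffer_heads; ll_rw_block() skips any that are
 * already uptodate, and %READA submissions may be dropped under load.
 */
#if 0
static struct buffer_head *examplefs_bread_ra(struct super_block *sb,
			sector_t block, sector_t ra_block)
{
	struct buffer_head *bh = sb_getblk(sb, block);
	struct buffer_head *ra = sb_getblk(sb, ra_block);

	ll_rw_block(READ, 1, &bh);
	ll_rw_block(READA, 1, &ra);
	brelse(ra);			/* readahead: fire and forget */

	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);			/* read error */
	return NULL;
}
#endif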
2408 * Sanity checks for try_to_free_buffers.
2410 static void check_ttfb_buffer(struct page *page, struct buffer_head *bh)
2412 if (!buffer_uptodate(bh)) {
2413 if (PageUptodate(page) && page->mapping
2414 && buffer_mapped(bh) /* discard_buffer */
2415 && S_ISBLK(page->mapping->host->i_mode))
2417 buffer_error();
2423 * try_to_free_buffers() checks if all the buffers on this particular page
2424 * are unused, and releases them if so.
2426 * Exclusion against try_to_free_buffers may be obtained by either
2427 * locking the page or by holding its mapping's private_lock.
2429 * If the page is dirty but all the buffers are clean then we need to
2430 * be sure to mark the page clean as well. This is because the page
2431 * may be against a block device, and a later reattachment of buffers
2432 * to a dirty page will set *all* buffers dirty. Which would corrupt
2433 * filesystem data on the same device.
2435 * The same applies to regular filesystem pages: if all the buffers are
2436 * clean then we set the page clean and proceed. To do that, we require
2437 * total exclusion from __set_page_dirty_buffers(). That is obtained with
2438 * private_lock.
2440 * try_to_free_buffers() is non-blocking.
2442 static inline int buffer_busy(struct buffer_head *bh)
2444 return atomic_read(&bh->b_count) |
2445 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2448 static int
2449 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2451 struct buffer_head *head = page_buffers(page);
2452 struct buffer_head *bh;
2453 int was_uptodate = 1;
2455 bh = head;
2456 do {
2457 check_ttfb_buffer(page, bh);
2458 if (buffer_busy(bh))
2459 goto failed;
2460 if (!buffer_uptodate(bh))
2461 was_uptodate = 0;
2462 bh = bh->b_this_page;
2463 } while (bh != head);
2465 if (!was_uptodate && PageUptodate(page))
2466 buffer_error();
2468 do {
2469 struct buffer_head *next = bh->b_this_page;
2471 if (!list_empty(&bh->b_assoc_buffers))
2472 __remove_assoc_queue(bh);
2473 bh = next;
2474 } while (bh != head);
2475 *buffers_to_free = head;
2476 __clear_page_buffers(page);
2477 return 1;
2478 failed:
2479 return 0;
2482 int try_to_free_buffers(struct page *page)
2484 struct address_space * const mapping = page->mapping;
2485 struct buffer_head *buffers_to_free = NULL;
2486 int ret = 0;
2488 BUG_ON(!PageLocked(page));
2489 if (PageWriteback(page))
2490 return 0;
2492 if (mapping == NULL) { /* swapped-in anon page */
2493 ret = drop_buffers(page, &buffers_to_free);
2494 goto out;
2497 spin_lock(&mapping->private_lock);
2498 ret = drop_buffers(page, &buffers_to_free);
2499 if (ret && !PageSwapCache(page)) {
2501 * If the filesystem writes its buffers by hand (eg ext3)
2502 * then we can have clean buffers against a dirty page. We
2503 * clean the page here; otherwise later reattachment of buffers
2504 * could encounter a non-uptodate page, which is unresolvable.
2505 * This only applies in the rare case where try_to_free_buffers
2506 * succeeds but the page is not freed.
2508 clear_page_dirty(page);
2510 spin_unlock(&mapping->private_lock);
2511 out:
2512 if (buffers_to_free) {
2513 struct buffer_head *bh = buffers_to_free;
2515 do {
2516 struct buffer_head *next = bh->b_this_page;
2517 free_buffer_head(bh);
2518 bh = next;
2519 } while (bh != buffers_to_free);
2521 return ret;
2523 EXPORT_SYMBOL(try_to_free_buffers);
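/*
 * Added illustration (hypothetical): a filesystem that takes its own
 * references on buffers (journalling in the ext3 style, say) cannot let
 * the VM call try_to_free_buffers() blindly; it provides a ->releasepage
 * that drops those references first and only then falls through here.
 * "examplefs_drop_buffer_refs" is an invented fs-private helper.
 */
#if 0
static int examplefs_drop_buffer_refs(struct page *page);

static int examplefs_releasepage(struct page *page, int gfp_mask)
{
	if (!examplefs_drop_buffer_refs(page))
		return 0;		/* buffers still pinned */
	return try_to_free_buffers(page);
}
#endif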
2525 int block_sync_page(struct page *page)
2527 blk_run_queues();
2528 return 0;
2532 * There are no bdflush tunables left. But distributions are
2533 * still running obsolete flush daemons, so we terminate them here.
2535 asmlinkage long sys_bdflush(int func, long data)
2537 if (!capable(CAP_SYS_ADMIN))
2538 return -EPERM;
2539 if (func == 1)
2540 do_exit(0);
2541 return 0;
2545 * Buffer-head allocation
2547 static kmem_cache_t *bh_cachep;
2548 static mempool_t *bh_mempool;
2551 * Once the number of bh's in the machine exceeds this level, we start
2552 * stripping them in writeback.
2554 static int max_buffer_heads;
2556 int buffer_heads_over_limit;
2558 struct bh_accounting {
2559 int nr; /* Number of live bh's */
2560 int ratelimit; /* Limit cacheline bouncing */
2563 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2565 static void recalc_bh_state(void)
2567 int i;
2568 int tot = 0;
2570 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2571 return;
2572 __get_cpu_var(bh_accounting).ratelimit = 0;
2573 for (i = 0; i < NR_CPUS; i++) {
2574 if (cpu_online(i))
2575 tot += per_cpu(bh_accounting, i).nr;
2577 buffer_heads_over_limit = (tot > max_buffer_heads);
2580 struct buffer_head *alloc_buffer_head(void)
2582 struct buffer_head *ret = mempool_alloc(bh_mempool, GFP_NOFS);
2583 if (ret) {
2584 preempt_disable();
2585 __get_cpu_var(bh_accounting).nr++;
2586 recalc_bh_state();
2587 preempt_enable();
2589 return ret;
2591 EXPORT_SYMBOL(alloc_buffer_head);
2593 void free_buffer_head(struct buffer_head *bh)
2595 BUG_ON(!list_empty(&bh->b_assoc_buffers));
2596 mempool_free(bh, bh_mempool);
2597 preempt_disable();
2598 __get_cpu_var(bh_accounting).nr--;
2599 recalc_bh_state();
2600 preempt_enable();
2602 EXPORT_SYMBOL(free_buffer_head);
2604 static void init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
2606 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2607 SLAB_CTOR_CONSTRUCTOR) {
2608 struct buffer_head * bh = (struct buffer_head *)data;
2610 memset(bh, 0, sizeof(*bh));
2611 INIT_LIST_HEAD(&bh->b_assoc_buffers);
2615 static void *bh_mempool_alloc(int gfp_mask, void *pool_data)
2617 return kmem_cache_alloc(bh_cachep, gfp_mask);
2620 static void bh_mempool_free(void *element, void *pool_data)
2622 kmem_cache_free(bh_cachep, element);
2625 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
2626 #define MAX_UNUSED_BUFFERS (NR_RESERVED+20)
2628 static void buffer_init_cpu(int cpu)
2630 struct bh_accounting *bha = &per_cpu(bh_accounting, cpu);
2631 struct bh_lru *bhl = &per_cpu(bh_lrus, cpu);
2633 bha->nr = 0;
2634 bha->ratelimit = 0;
2635 memset(bhl, 0, sizeof(*bhl));
2638 static int __devinit buffer_cpu_notify(struct notifier_block *self,
2639 unsigned long action, void *hcpu)
2641 long cpu = (long)hcpu;
2642 switch(action) {
2643 case CPU_UP_PREPARE:
2644 buffer_init_cpu(cpu);
2645 break;
2646 default:
2647 break;
2649 return NOTIFY_OK;
2652 static struct notifier_block __devinitdata buffer_nb = {
2653 .notifier_call = buffer_cpu_notify,
2656 void __init buffer_init(void)
2658 int i;
2659 int nrpages;
2661 bh_cachep = kmem_cache_create("buffer_head",
2662 sizeof(struct buffer_head), 0,
2663 0, init_buffer_head, NULL);
2664 bh_mempool = mempool_create(MAX_UNUSED_BUFFERS, bh_mempool_alloc,
2665 bh_mempool_free, NULL);
2666 for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
2667 init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
2670 * Limit the bh occupancy to 10% of ZONE_NORMAL
2672 nrpages = (nr_free_buffer_pages() * 10) / 100;
2673 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
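	/*
	 * Added commentary with illustrative numbers: with 4K pages and,
	 * say, 64K pages of ZONE_NORMAL, nrpages comes to about 6500; if a
	 * buffer_head were roughly 96 bytes (the real sizeof varies by
	 * arch/config) that allows around 6500 * 42, i.e. about a quarter
	 * of a million buffer_heads, before buffer_heads_over_limit trips.
	 */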
2674 buffer_cpu_notify(&buffer_nb, (unsigned long)CPU_UP_PREPARE,
2675 (void *)(long)smp_processor_id());
2676 register_cpu_notifier(&buffer_nb);