Merge branches 'tracing/ftrace' and 'tracing/urgent' into tracing/core
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / fs / buffer.c
blob6569fda5cfed892c7e4ed53daa2e9677e60c36d5
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
7 /*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
52 bh->b_end_io = handler;
53 bh->b_private = private;
56 static int sync_buffer(void *word)
58 struct block_device *bd;
59 struct buffer_head *bh
60 = container_of(word, struct buffer_head, b_state);
62 smp_mb();
63 bd = bh->b_bdev;
64 if (bd)
65 blk_run_address_space(bd->bd_inode->i_mapping);
66 io_schedule();
67 return 0;
70 void __lock_buffer(struct buffer_head *bh)
72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 TASK_UNINTERRUPTIBLE);
75 EXPORT_SYMBOL(__lock_buffer);
77 void unlock_buffer(struct buffer_head *bh)
79 clear_bit_unlock(BH_Lock, &bh->b_state);
80 smp_mb__after_clear_bit();
81 wake_up_bit(&bh->b_state, BH_Lock);
85 * Block until a buffer comes unlocked. This doesn't stop it
86 * from becoming locked again - you have to lock it yourself
87 * if you want to preserve its state.
89 void __wait_on_buffer(struct buffer_head * bh)
91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
/*
 * Detach the private (buffer) data from @page: clear PagePrivate, zero the
 * page's private word and drop one page reference.
 */
static void __clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_cache_release(page);
}
102 static void buffer_io_error(struct buffer_head *bh)
104 char b[BDEVNAME_SIZE];
106 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
107 bdevname(bh->b_bdev, b),
108 (unsigned long long)bh->b_blocknr);
/*
 * End-of-IO helper for reads that does not touch @bh once it has been
 * unlocked.
 *
 * Note: unlock_buffer() does, strictly speaking, use the bh after unlocking
 * it, but only its address (for hashing), never its contents, so a race
 * there is benign.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}
/*
 * Default synchronous read completion handler: record the uptodate state,
 * unlock the buffer, then drop the reference on it.  ll_rw_block uses this
 * handler as well.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	/* Updates BH_Uptodate and unlocks; bh is only put afterwards. */
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
140 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
142 char b[BDEVNAME_SIZE];
144 if (uptodate) {
145 set_buffer_uptodate(bh);
146 } else {
147 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
148 buffer_io_error(bh);
149 printk(KERN_WARNING "lost page write due to "
150 "I/O error on %s\n",
151 bdevname(bh->b_bdev, b));
153 set_buffer_write_io_error(bh);
154 clear_buffer_uptodate(bh);
156 unlock_buffer(bh);
157 put_bh(bh);
161 * Write out and wait upon all the dirty data associated with a block
162 * device via its mapping. Does not take the superblock lock.
164 int sync_blockdev(struct block_device *bdev)
166 int ret = 0;
168 if (bdev)
169 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
170 return ret;
172 EXPORT_SYMBOL(sync_blockdev);
/*
 * Write out and wait upon all dirty data associated with this device:
 * filesystem data as well as the underlying block device.  Takes the
 * superblock lock.
 *
 * If no superblock is found for @bdev, only the block device itself is
 * synced.
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb;
	int res;

	sb = get_super(bdev);
	if (!sb)
		return sync_blockdev(bdev);

	res = fsync_super(sb);
	drop_super(sb);
	return res;
}
191 * freeze_bdev -- lock a filesystem and force it into a consistent state
192 * @bdev: blockdevice to lock
194 * This takes the block device bd_mount_sem to make sure no new mounts
195 * happen on bdev until thaw_bdev() is called.
196 * If a superblock is found on this device, we take the s_umount semaphore
197 * on it to make sure nobody unmounts until the snapshot creation is done.
199 struct super_block *freeze_bdev(struct block_device *bdev)
201 struct super_block *sb;
203 down(&bdev->bd_mount_sem);
204 sb = get_super(bdev);
205 if (sb && !(sb->s_flags & MS_RDONLY)) {
206 sb->s_frozen = SB_FREEZE_WRITE;
207 smp_wmb();
209 __fsync_super(sb);
211 sb->s_frozen = SB_FREEZE_TRANS;
212 smp_wmb();
214 sync_blockdev(sb->s_bdev);
216 if (sb->s_op->write_super_lockfs)
217 sb->s_op->write_super_lockfs(sb);
220 sync_blockdev(bdev);
221 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
223 EXPORT_SYMBOL(freeze_bdev);
226 * thaw_bdev -- unlock filesystem
227 * @bdev: blockdevice to unlock
228 * @sb: associated superblock
230 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
232 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
234 if (sb) {
235 BUG_ON(sb->s_bdev != bdev);
237 if (sb->s_op->unlockfs)
238 sb->s_op->unlockfs(sb);
239 sb->s_frozen = SB_UNFROZEN;
240 smp_wmb();
241 wake_up(&sb->s_wait_unfrozen);
242 drop_super(sb);
245 up(&bdev->bd_mount_sem);
247 EXPORT_SYMBOL(thaw_bdev);
250 * Various filesystems appear to want __find_get_block to be non-blocking.
251 * But it's the page lock which protects the buffers. To get around this,
252 * we get exclusion from try_to_free_buffers with the blockdev mapping's
253 * private_lock.
255 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
256 * may be quite high. This code could TryLock the page, and if that
257 * succeeds, there is no need to take private_lock. (But if
258 * private_lock is contended then so is mapping->tree_lock).
260 static struct buffer_head *
261 __find_get_block_slow(struct block_device *bdev, sector_t block)
263 struct inode *bd_inode = bdev->bd_inode;
264 struct address_space *bd_mapping = bd_inode->i_mapping;
265 struct buffer_head *ret = NULL;
266 pgoff_t index;
267 struct buffer_head *bh;
268 struct buffer_head *head;
269 struct page *page;
270 int all_mapped = 1;
272 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
273 page = find_get_page(bd_mapping, index);
274 if (!page)
275 goto out;
277 spin_lock(&bd_mapping->private_lock);
278 if (!page_has_buffers(page))
279 goto out_unlock;
280 head = page_buffers(page);
281 bh = head;
282 do {
283 if (bh->b_blocknr == block) {
284 ret = bh;
285 get_bh(bh);
286 goto out_unlock;
288 if (!buffer_mapped(bh))
289 all_mapped = 0;
290 bh = bh->b_this_page;
291 } while (bh != head);
293 /* we might be here because some of the buffers on this page are
294 * not mapped. This is due to various races between
295 * file io on the block device and getblk. It gets dealt with
296 * elsewhere, don't buffer_error if we had some unmapped buffers
298 if (all_mapped) {
299 printk("__find_get_block_slow() failed. "
300 "block=%llu, b_blocknr=%llu\n",
301 (unsigned long long)block,
302 (unsigned long long)bh->b_blocknr);
303 printk("b_state=0x%08lx, b_size=%zu\n",
304 bh->b_state, bh->b_size);
305 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
307 out_unlock:
308 spin_unlock(&bd_mapping->private_lock);
309 page_cache_release(page);
310 out:
311 return ret;
314 /* If invalidate_buffers() will trash dirty buffers, it means some kind
315 of fs corruption is going on. Trashing dirty data always imply losing
316 information that was supposed to be just stored on the physical layer
317 by the user.
319 Thus invalidate_buffers in general usage is not allowed to trash
320 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
321 be preserved. These buffers are simply skipped.
323 We also skip buffers which are still in use. For example this can
324 happen if a userspace program is reading the block device.
326 NOTE: In the case where the user removed a removable-media-disk even if
327 there's still dirty data not synced on disk (due a bug in the device driver
328 or due an error of the user), by not destroying the dirty buffers we could
329 generate corruption also on the next media inserted, thus a parameter is
330 necessary to handle this case in the most safe way possible (trying
331 to not corrupt also the new disk inserted with the data belonging to
332 the old now corrupted disk). Also for the ramdisk the natural thing
333 to do in order to release the ramdisk memory is to destroy dirty buffers.
335 These are two special cases. Normal usage imply the device driver
336 to issue a sync on the device (without waiting I/O completion) and
337 then an invalidate_buffers call that doesn't trash dirty buffers.
339 For handling cache coherency with the blkdev pagecache the 'update' case
340 has been introduced. It is needed to re-read from disk any pinned
341 buffer. NOTE: re-reading from disk is destructive so we can do it only
342 when we assume nobody is changing the buffercache under our I/O and when
343 we think the disk contains more recent information than the buffercache.
344 The update == 1 pass marks the buffers we need to update, the update == 2
345 pass does the actual I/O. */
346 void invalidate_bdev(struct block_device *bdev)
348 struct address_space *mapping = bdev->bd_inode->i_mapping;
350 if (mapping->nrpages == 0)
351 return;
353 invalidate_bh_lrus();
354 invalidate_mapping_pages(mapping, 0, -1);
358 * Kick pdflush then try to free up some ZONE_NORMAL memory.
360 static void free_more_memory(void)
362 struct zone *zone;
363 int nid;
365 wakeup_pdflush(1024);
366 yield();
368 for_each_online_node(nid) {
369 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
370 gfp_zone(GFP_NOFS), NULL,
371 &zone);
372 if (zone)
373 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
374 GFP_NOFS);
379 * I/O completion handler for block_read_full_page() - pages
380 * which come unlocked at the end of I/O.
382 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
384 unsigned long flags;
385 struct buffer_head *first;
386 struct buffer_head *tmp;
387 struct page *page;
388 int page_uptodate = 1;
390 BUG_ON(!buffer_async_read(bh));
392 page = bh->b_page;
393 if (uptodate) {
394 set_buffer_uptodate(bh);
395 } else {
396 clear_buffer_uptodate(bh);
397 if (printk_ratelimit())
398 buffer_io_error(bh);
399 SetPageError(page);
403 * Be _very_ careful from here on. Bad things can happen if
404 * two buffer heads end IO at almost the same time and both
405 * decide that the page is now completely done.
407 first = page_buffers(page);
408 local_irq_save(flags);
409 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
410 clear_buffer_async_read(bh);
411 unlock_buffer(bh);
412 tmp = bh;
413 do {
414 if (!buffer_uptodate(tmp))
415 page_uptodate = 0;
416 if (buffer_async_read(tmp)) {
417 BUG_ON(!buffer_locked(tmp));
418 goto still_busy;
420 tmp = tmp->b_this_page;
421 } while (tmp != bh);
422 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
423 local_irq_restore(flags);
426 * If none of the buffers had errors and they are all
427 * uptodate then we can set the page uptodate.
429 if (page_uptodate && !PageError(page))
430 SetPageUptodate(page);
431 unlock_page(page);
432 return;
434 still_busy:
435 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
436 local_irq_restore(flags);
437 return;
441 * Completion handler for block_write_full_page() - pages which are unlocked
442 * during I/O, and which have PageWriteback cleared upon I/O completion.
444 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
446 char b[BDEVNAME_SIZE];
447 unsigned long flags;
448 struct buffer_head *first;
449 struct buffer_head *tmp;
450 struct page *page;
452 BUG_ON(!buffer_async_write(bh));
454 page = bh->b_page;
455 if (uptodate) {
456 set_buffer_uptodate(bh);
457 } else {
458 if (printk_ratelimit()) {
459 buffer_io_error(bh);
460 printk(KERN_WARNING "lost page write due to "
461 "I/O error on %s\n",
462 bdevname(bh->b_bdev, b));
464 set_bit(AS_EIO, &page->mapping->flags);
465 set_buffer_write_io_error(bh);
466 clear_buffer_uptodate(bh);
467 SetPageError(page);
470 first = page_buffers(page);
471 local_irq_save(flags);
472 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
474 clear_buffer_async_write(bh);
475 unlock_buffer(bh);
476 tmp = bh->b_this_page;
477 while (tmp != bh) {
478 if (buffer_async_write(tmp)) {
479 BUG_ON(!buffer_locked(tmp));
480 goto still_busy;
482 tmp = tmp->b_this_page;
484 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
485 local_irq_restore(flags);
486 end_page_writeback(page);
487 return;
489 still_busy:
490 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
491 local_irq_restore(flags);
492 return;
496 * If a page's buffers are under async readin (end_buffer_async_read
497 * completion) then there is a possibility that another thread of
498 * control could lock one of the buffers after it has completed
499 * but while some of the other buffers have not completed. This
500 * locked buffer would confuse end_buffer_async_read() into not unlocking
501 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
502 * that this buffer is not under async I/O.
504 * The page comes unlocked when it has no locked buffer_async buffers
505 * left.
507 * PageLocked prevents anyone starting new async I/O reads any of
508 * the buffers.
510 * PageWriteback is used to prevent simultaneous writeout of the same
511 * page.
513 * PageLocked prevents anyone from starting writeback of a page which is
514 * under read I/O (PageWriteback is only ever set against a locked page).
516 static void mark_buffer_async_read(struct buffer_head *bh)
518 bh->b_end_io = end_buffer_async_read;
519 set_buffer_async_read(bh);
522 void mark_buffer_async_write(struct buffer_head *bh)
524 bh->b_end_io = end_buffer_async_write;
525 set_buffer_async_write(bh);
527 EXPORT_SYMBOL(mark_buffer_async_write);
531 * fs/buffer.c contains helper functions for buffer-backed address space's
532 * fsync functions. A common requirement for buffer-based filesystems is
533 * that certain data from the backing blockdev needs to be written out for
534 * a successful fsync(). For example, ext2 indirect blocks need to be
535 * written back and waited upon before fsync() returns.
537 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
538 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
539 * management of a list of dependent buffers at ->i_mapping->private_list.
541 * Locking is a little subtle: try_to_free_buffers() will remove buffers
542 * from their controlling inode's queue when they are being freed. But
543 * try_to_free_buffers() will be operating against the *blockdev* mapping
544 * at the time, not against the S_ISREG file which depends on those buffers.
545 * So the locking for private_list is via the private_lock in the address_space
546 * which backs the buffers. Which is different from the address_space
547 * against which the buffers are listed. So for a particular address_space,
548 * mapping->private_lock does *not* protect mapping->private_list! In fact,
549 * mapping->private_list will always be protected by the backing blockdev's
550 * ->private_lock.
552 * Which introduces a requirement: all buffers on an address_space's
553 * ->private_list must be from the same address_space: the blockdev's.
555 * address_spaces which do not place buffers at ->private_list via these
556 * utility functions are free to use private_lock and private_list for
557 * whatever they want. The only requirement is that list_empty(private_list)
558 * be true at clear_inode() time.
560 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
561 * filesystems should do that. invalidate_inode_buffers() should just go
562 * BUG_ON(!list_empty).
564 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
565 * take an address_space, not an inode. And it should be called
566 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
567 * queued up.
569 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
570 * list if it is already on a list. Because if the buffer is on a list,
571 * it *must* already be on the right one. If not, the filesystem is being
572 * silly. This will save a ton of locking. But first we have to ensure
573 * that buffers are taken *off* the old inode's list when they are freed
574 * (presumably in truncate). That requires careful auditing of all
575 * filesystems (do it inside bforget()). It could also be done by bringing
576 * b_inode back.
580 * The buffer's backing address_space's private_lock must be held
582 static void __remove_assoc_queue(struct buffer_head *bh)
584 list_del_init(&bh->b_assoc_buffers);
585 WARN_ON(!bh->b_assoc_map);
586 if (buffer_write_io_error(bh))
587 set_bit(AS_EIO, &bh->b_assoc_map->flags);
588 bh->b_assoc_map = NULL;
591 int inode_has_buffers(struct inode *inode)
593 return !list_empty(&inode->i_data.private_list);
597 * osync is designed to support O_SYNC io. It waits synchronously for
598 * all already-submitted IO to complete, but does not queue any new
599 * writes to the disk.
601 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
602 * you dirty the buffers, and then use osync_inode_buffers to wait for
603 * completion. Any other dirty buffers which are not yet queued for
604 * write will not be flushed to disk by the osync.
606 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
608 struct buffer_head *bh;
609 struct list_head *p;
610 int err = 0;
612 spin_lock(lock);
613 repeat:
614 list_for_each_prev(p, list) {
615 bh = BH_ENTRY(p);
616 if (buffer_locked(bh)) {
617 get_bh(bh);
618 spin_unlock(lock);
619 wait_on_buffer(bh);
620 if (!buffer_uptodate(bh))
621 err = -EIO;
622 brelse(bh);
623 spin_lock(lock);
624 goto repeat;
627 spin_unlock(lock);
628 return err;
632 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
633 * @mapping: the mapping which wants those buffers written
635 * Starts I/O against the buffers at mapping->private_list, and waits upon
636 * that I/O.
638 * Basically, this is a convenience function for fsync().
639 * @mapping is a file or directory which needs those buffers to be written for
640 * a successful fsync().
642 int sync_mapping_buffers(struct address_space *mapping)
644 struct address_space *buffer_mapping = mapping->assoc_mapping;
646 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
647 return 0;
649 return fsync_buffers_list(&buffer_mapping->private_lock,
650 &mapping->private_list);
652 EXPORT_SYMBOL(sync_mapping_buffers);
655 * Called when we've recently written block `bblock', and it is known that
656 * `bblock' was for a buffer_boundary() buffer. This means that the block at
657 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
658 * dirty, schedule it for IO. So that indirects merge nicely with their data.
660 void write_boundary_block(struct block_device *bdev,
661 sector_t bblock, unsigned blocksize)
663 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
664 if (bh) {
665 if (buffer_dirty(bh))
666 ll_rw_block(WRITE, 1, &bh);
667 put_bh(bh);
671 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
673 struct address_space *mapping = inode->i_mapping;
674 struct address_space *buffer_mapping = bh->b_page->mapping;
676 mark_buffer_dirty(bh);
677 if (!mapping->assoc_mapping) {
678 mapping->assoc_mapping = buffer_mapping;
679 } else {
680 BUG_ON(mapping->assoc_mapping != buffer_mapping);
682 if (!bh->b_assoc_map) {
683 spin_lock(&buffer_mapping->private_lock);
684 list_move_tail(&bh->b_assoc_buffers,
685 &mapping->private_list);
686 bh->b_assoc_map = mapping;
687 spin_unlock(&buffer_mapping->private_lock);
690 EXPORT_SYMBOL(mark_buffer_dirty_inode);
693 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
694 * dirty.
696 * If warn is true, then emit a warning if the page is not uptodate and has
697 * not been truncated.
699 static int __set_page_dirty(struct page *page,
700 struct address_space *mapping, int warn)
702 if (unlikely(!mapping))
703 return !TestSetPageDirty(page);
705 if (TestSetPageDirty(page))
706 return 0;
708 spin_lock_irq(&mapping->tree_lock);
709 if (page->mapping) { /* Race with truncate? */
710 WARN_ON_ONCE(warn && !PageUptodate(page));
712 if (mapping_cap_account_dirty(mapping)) {
713 __inc_zone_page_state(page, NR_FILE_DIRTY);
714 __inc_bdi_stat(mapping->backing_dev_info,
715 BDI_RECLAIMABLE);
716 task_io_account_write(PAGE_CACHE_SIZE);
718 radix_tree_tag_set(&mapping->page_tree,
719 page_index(page), PAGECACHE_TAG_DIRTY);
721 spin_unlock_irq(&mapping->tree_lock);
722 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
724 return 1;
728 * Add a page to the dirty page list.
730 * It is a sad fact of life that this function is called from several places
731 * deeply under spinlocking. It may not sleep.
733 * If the page has buffers, the uptodate buffers are set dirty, to preserve
734 * dirty-state coherency between the page and the buffers. It the page does
735 * not have buffers then when they are later attached they will all be set
736 * dirty.
738 * The buffers are dirtied before the page is dirtied. There's a small race
739 * window in which a writepage caller may see the page cleanness but not the
740 * buffer dirtiness. That's fine. If this code were to set the page dirty
741 * before the buffers, a concurrent writepage caller could clear the page dirty
742 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
743 * page on the dirty page list.
745 * We use private_lock to lock against try_to_free_buffers while using the
746 * page's buffer list. Also use this to protect against clean buffers being
747 * added to the page after it was set dirty.
749 * FIXME: may need to call ->reservepage here as well. That's rather up to the
750 * address_space though.
752 int __set_page_dirty_buffers(struct page *page)
754 struct address_space *mapping = page_mapping(page);
756 if (unlikely(!mapping))
757 return !TestSetPageDirty(page);
759 spin_lock(&mapping->private_lock);
760 if (page_has_buffers(page)) {
761 struct buffer_head *head = page_buffers(page);
762 struct buffer_head *bh = head;
764 do {
765 set_buffer_dirty(bh);
766 bh = bh->b_this_page;
767 } while (bh != head);
769 spin_unlock(&mapping->private_lock);
771 return __set_page_dirty(page, mapping, 1);
773 EXPORT_SYMBOL(__set_page_dirty_buffers);
776 * Write out and wait upon a list of buffers.
778 * We have conflicting pressures: we want to make sure that all
779 * initially dirty buffers get waited on, but that any subsequently
780 * dirtied buffers don't. After all, we don't want fsync to last
781 * forever if somebody is actively writing to the file.
783 * Do this in two main stages: first we copy dirty buffers to a
784 * temporary inode list, queueing the writes as we go. Then we clean
785 * up, waiting for those writes to complete.
787 * During this second stage, any subsequent updates to the file may end
788 * up refiling the buffer on the original inode's dirty list again, so
789 * there is a chance we will end up with a buffer queued for write but
790 * not yet completed on that list. So, as a final cleanup we go through
791 * the osync code to catch these locked, dirty buffers without requeuing
792 * any newly dirty buffers for write.
794 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
796 struct buffer_head *bh;
797 struct list_head tmp;
798 struct address_space *mapping;
799 int err = 0, err2;
801 INIT_LIST_HEAD(&tmp);
803 spin_lock(lock);
804 while (!list_empty(list)) {
805 bh = BH_ENTRY(list->next);
806 mapping = bh->b_assoc_map;
807 __remove_assoc_queue(bh);
808 /* Avoid race with mark_buffer_dirty_inode() which does
809 * a lockless check and we rely on seeing the dirty bit */
810 smp_mb();
811 if (buffer_dirty(bh) || buffer_locked(bh)) {
812 list_add(&bh->b_assoc_buffers, &tmp);
813 bh->b_assoc_map = mapping;
814 if (buffer_dirty(bh)) {
815 get_bh(bh);
816 spin_unlock(lock);
818 * Ensure any pending I/O completes so that
819 * ll_rw_block() actually writes the current
820 * contents - it is a noop if I/O is still in
821 * flight on potentially older contents.
823 ll_rw_block(SWRITE_SYNC, 1, &bh);
824 brelse(bh);
825 spin_lock(lock);
830 while (!list_empty(&tmp)) {
831 bh = BH_ENTRY(tmp.prev);
832 get_bh(bh);
833 mapping = bh->b_assoc_map;
834 __remove_assoc_queue(bh);
835 /* Avoid race with mark_buffer_dirty_inode() which does
836 * a lockless check and we rely on seeing the dirty bit */
837 smp_mb();
838 if (buffer_dirty(bh)) {
839 list_add(&bh->b_assoc_buffers,
840 &mapping->private_list);
841 bh->b_assoc_map = mapping;
843 spin_unlock(lock);
844 wait_on_buffer(bh);
845 if (!buffer_uptodate(bh))
846 err = -EIO;
847 brelse(bh);
848 spin_lock(lock);
851 spin_unlock(lock);
852 err2 = osync_buffers_list(lock, list);
853 if (err)
854 return err;
855 else
856 return err2;
860 * Invalidate any and all dirty buffers on a given inode. We are
861 * probably unmounting the fs, but that doesn't mean we have already
862 * done a sync(). Just drop the buffers from the inode list.
864 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
865 * assumes that all the buffers are against the blockdev. Not true
866 * for reiserfs.
868 void invalidate_inode_buffers(struct inode *inode)
870 if (inode_has_buffers(inode)) {
871 struct address_space *mapping = &inode->i_data;
872 struct list_head *list = &mapping->private_list;
873 struct address_space *buffer_mapping = mapping->assoc_mapping;
875 spin_lock(&buffer_mapping->private_lock);
876 while (!list_empty(list))
877 __remove_assoc_queue(BH_ENTRY(list->next));
878 spin_unlock(&buffer_mapping->private_lock);
883 * Remove any clean buffers from the inode's buffer list. This is called
884 * when we're trying to free the inode itself. Those buffers can pin it.
886 * Returns true if all buffers were removed.
888 int remove_inode_buffers(struct inode *inode)
890 int ret = 1;
892 if (inode_has_buffers(inode)) {
893 struct address_space *mapping = &inode->i_data;
894 struct list_head *list = &mapping->private_list;
895 struct address_space *buffer_mapping = mapping->assoc_mapping;
897 spin_lock(&buffer_mapping->private_lock);
898 while (!list_empty(list)) {
899 struct buffer_head *bh = BH_ENTRY(list->next);
900 if (buffer_dirty(bh)) {
901 ret = 0;
902 break;
904 __remove_assoc_queue(bh);
906 spin_unlock(&buffer_mapping->private_lock);
908 return ret;
912 * Create the appropriate buffers when given a page for data area and
913 * the size of each buffer.. Use the bh->b_this_page linked list to
914 * follow the buffers created. Return NULL if unable to create more
915 * buffers.
917 * The retry flag is used to differentiate async IO (paging, swapping)
918 * which may not fail from ordinary buffer allocations.
920 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
921 int retry)
923 struct buffer_head *bh, *head;
924 long offset;
926 try_again:
927 head = NULL;
928 offset = PAGE_SIZE;
929 while ((offset -= size) >= 0) {
930 bh = alloc_buffer_head(GFP_NOFS);
931 if (!bh)
932 goto no_grow;
934 bh->b_bdev = NULL;
935 bh->b_this_page = head;
936 bh->b_blocknr = -1;
937 head = bh;
939 bh->b_state = 0;
940 atomic_set(&bh->b_count, 0);
941 bh->b_private = NULL;
942 bh->b_size = size;
944 /* Link the buffer to its page */
945 set_bh_page(bh, page, offset);
947 init_buffer(bh, NULL, NULL);
949 return head;
951 * In case anything failed, we just free everything we got.
953 no_grow:
954 if (head) {
955 do {
956 bh = head;
957 head = head->b_this_page;
958 free_buffer_head(bh);
959 } while (head);
963 * Return failure for non-async IO requests. Async IO requests
964 * are not allowed to fail, so we have to wait until buffer heads
965 * become available. But we don't want tasks sleeping with
966 * partially complete buffers, so all were released above.
968 if (!retry)
969 return NULL;
971 /* We're _really_ low on memory. Now we just
972 * wait for old buffer heads to become free due to
973 * finishing IO. Since this is an async request and
974 * the reserve list is empty, we're sure there are
975 * async buffer heads in use.
977 free_more_memory();
978 goto try_again;
980 EXPORT_SYMBOL_GPL(alloc_page_buffers);
982 static inline void
983 link_dev_buffers(struct page *page, struct buffer_head *head)
985 struct buffer_head *bh, *tail;
987 bh = head;
988 do {
989 tail = bh;
990 bh = bh->b_this_page;
991 } while (bh);
992 tail->b_this_page = head;
993 attach_page_buffers(page, head);
997 * Initialise the state of a blockdev page's buffers.
999 static void
1000 init_page_buffers(struct page *page, struct block_device *bdev,
1001 sector_t block, int size)
1003 struct buffer_head *head = page_buffers(page);
1004 struct buffer_head *bh = head;
1005 int uptodate = PageUptodate(page);
1007 do {
1008 if (!buffer_mapped(bh)) {
1009 init_buffer(bh, NULL, NULL);
1010 bh->b_bdev = bdev;
1011 bh->b_blocknr = block;
1012 if (uptodate)
1013 set_buffer_uptodate(bh);
1014 set_buffer_mapped(bh);
1016 block++;
1017 bh = bh->b_this_page;
1018 } while (bh != head);
1022 * Create the page-cache page that contains the requested block.
1024 * This is user purely for blockdev mappings.
1026 static struct page *
1027 grow_dev_page(struct block_device *bdev, sector_t block,
1028 pgoff_t index, int size)
1030 struct inode *inode = bdev->bd_inode;
1031 struct page *page;
1032 struct buffer_head *bh;
1034 page = find_or_create_page(inode->i_mapping, index,
1035 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1036 if (!page)
1037 return NULL;
1039 BUG_ON(!PageLocked(page));
1041 if (page_has_buffers(page)) {
1042 bh = page_buffers(page);
1043 if (bh->b_size == size) {
1044 init_page_buffers(page, bdev, block, size);
1045 return page;
1047 if (!try_to_free_buffers(page))
1048 goto failed;
1052 * Allocate some buffers for this page
1054 bh = alloc_page_buffers(page, size, 0);
1055 if (!bh)
1056 goto failed;
1059 * Link the page to the buffers and initialise them. Take the
1060 * lock to be atomic wrt __find_get_block(), which does not
1061 * run under the page lock.
1063 spin_lock(&inode->i_mapping->private_lock);
1064 link_dev_buffers(page, bh);
1065 init_page_buffers(page, bdev, block, size);
1066 spin_unlock(&inode->i_mapping->private_lock);
1067 return page;
1069 failed:
1070 BUG();
1071 unlock_page(page);
1072 page_cache_release(page);
1073 return NULL;
1077 * Create buffers for the specified block device block's page. If
1078 * that page was dirty, the buffers are set dirty also.
1080 static int
1081 grow_buffers(struct block_device *bdev, sector_t block, int size)
1083 struct page *page;
1084 pgoff_t index;
1085 int sizebits;
1087 sizebits = -1;
1088 do {
1089 sizebits++;
1090 } while ((size << sizebits) < PAGE_SIZE);
1092 index = block >> sizebits;
1095 * Check for a block which wants to lie outside our maximum possible
1096 * pagecache index. (this comparison is done using sector_t types).
1098 if (unlikely(index != block >> sizebits)) {
1099 char b[BDEVNAME_SIZE];
1101 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1102 "device %s\n",
1103 __func__, (unsigned long long)block,
1104 bdevname(bdev, b));
1105 return -EIO;
1107 block = index << sizebits;
1108 /* Create a page with the proper size buffers.. */
1109 page = grow_dev_page(bdev, block, index, size);
1110 if (!page)
1111 return 0;
1112 unlock_page(page);
1113 page_cache_release(page);
1114 return 1;
1117 static struct buffer_head *
1118 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1120 /* Size must be multiple of hard sectorsize */
1121 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1122 (size < 512 || size > PAGE_SIZE))) {
1123 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1124 size);
1125 printk(KERN_ERR "hardsect size: %d\n",
1126 bdev_hardsect_size(bdev));
1128 dump_stack();
1129 return NULL;
1132 for (;;) {
1133 struct buffer_head * bh;
1134 int ret;
1136 bh = __find_get_block(bdev, block, size);
1137 if (bh)
1138 return bh;
1140 ret = grow_buffers(bdev, block, size);
1141 if (ret < 0)
1142 return NULL;
1143 if (ret == 0)
1144 free_more_memory();
1149 * The relationship between dirty buffers and dirty pages:
1151 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1152 * the page is tagged dirty in its radix tree.
1154 * At all times, the dirtiness of the buffers represents the dirtiness of
1155 * subsections of the page. If the page has buffers, the page dirty bit is
1156 * merely a hint about the true dirty state.
1158 * When a page is set dirty in its entirety, all its buffers are marked dirty
1159 * (if the page has buffers).
1161 * When a buffer is marked dirty, its page is dirtied, but the page's other
1162 * buffers are not.
1164 * Also. When blockdev buffers are explicitly read with bread(), they
1165 * individually become uptodate. But their backing page remains not
1166 * uptodate - even if all of its buffers are uptodate. A subsequent
1167 * block_read_full_page() against that page will discover all the uptodate
1168 * buffers, will set the page uptodate and will perform no I/O.
1172 * mark_buffer_dirty - mark a buffer_head as needing writeout
1173 * @bh: the buffer_head to mark dirty
1175 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1176 * backing page dirty, then tag the page as dirty in its address_space's radix
1177 * tree and then attach the address_space's inode to its superblock's dirty
1178 * inode list.
1180 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1181 * mapping->tree_lock and the global inode_lock.
1183 void mark_buffer_dirty(struct buffer_head *bh)
1185 WARN_ON_ONCE(!buffer_uptodate(bh));
1188 * Very *carefully* optimize the it-is-already-dirty case.
1190 * Don't let the final "is it dirty" escape to before we
1191 * perhaps modified the buffer.
1193 if (buffer_dirty(bh)) {
1194 smp_mb();
1195 if (buffer_dirty(bh))
1196 return;
1199 if (!test_set_buffer_dirty(bh))
1200 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1204 * Decrement a buffer_head's reference count. If all buffers against a page
1205 * have zero reference count, are clean and unlocked, and if the page is clean
1206 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1207 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1208 * a page but it ends up not being freed, and buffers may later be reattached).
1210 void __brelse(struct buffer_head * buf)
1212 if (atomic_read(&buf->b_count)) {
1213 put_bh(buf);
1214 return;
1216 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1220 * bforget() is like brelse(), except it discards any
1221 * potentially dirty data.
1223 void __bforget(struct buffer_head *bh)
1225 clear_buffer_dirty(bh);
1226 if (bh->b_assoc_map) {
1227 struct address_space *buffer_mapping = bh->b_page->mapping;
1229 spin_lock(&buffer_mapping->private_lock);
1230 list_del_init(&bh->b_assoc_buffers);
1231 bh->b_assoc_map = NULL;
1232 spin_unlock(&buffer_mapping->private_lock);
1234 __brelse(bh);
1237 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1239 lock_buffer(bh);
1240 if (buffer_uptodate(bh)) {
1241 unlock_buffer(bh);
1242 return bh;
1243 } else {
1244 get_bh(bh);
1245 bh->b_end_io = end_buffer_read_sync;
1246 submit_bh(READ, bh);
1247 wait_on_buffer(bh);
1248 if (buffer_uptodate(bh))
1249 return bh;
1251 brelse(bh);
1252 return NULL;
1256 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1257 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1258 * refcount elevated by one when they're in an LRU. A buffer can only appear
1259 * once in a particular CPU's LRU. A single buffer can be present in multiple
1260 * CPU's LRUs at the same time.
1262 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1263 * sb_find_get_block().
1265 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1266 * a local interrupt disable for that.
1269 #define BH_LRU_SIZE 8
1271 struct bh_lru {
1272 struct buffer_head *bhs[BH_LRU_SIZE];
1275 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1277 #ifdef CONFIG_SMP
1278 #define bh_lru_lock() local_irq_disable()
1279 #define bh_lru_unlock() local_irq_enable()
1280 #else
1281 #define bh_lru_lock() preempt_disable()
1282 #define bh_lru_unlock() preempt_enable()
1283 #endif
/*
 * Sanity check for the LRU entry points: BUG if interrupts are disabled.
 * Compiles to nothing when irqs_disabled is not defined as a macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1293 * The LRU management algorithm is dopey-but-simple. Sorry.
1295 static void bh_lru_install(struct buffer_head *bh)
1297 struct buffer_head *evictee = NULL;
1298 struct bh_lru *lru;
1300 check_irqs_on();
1301 bh_lru_lock();
1302 lru = &__get_cpu_var(bh_lrus);
1303 if (lru->bhs[0] != bh) {
1304 struct buffer_head *bhs[BH_LRU_SIZE];
1305 int in;
1306 int out = 0;
1308 get_bh(bh);
1309 bhs[out++] = bh;
1310 for (in = 0; in < BH_LRU_SIZE; in++) {
1311 struct buffer_head *bh2 = lru->bhs[in];
1313 if (bh2 == bh) {
1314 __brelse(bh2);
1315 } else {
1316 if (out >= BH_LRU_SIZE) {
1317 BUG_ON(evictee != NULL);
1318 evictee = bh2;
1319 } else {
1320 bhs[out++] = bh2;
1324 while (out < BH_LRU_SIZE)
1325 bhs[out++] = NULL;
1326 memcpy(lru->bhs, bhs, sizeof(bhs));
1328 bh_lru_unlock();
1330 if (evictee)
1331 __brelse(evictee);
1335 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1337 static struct buffer_head *
1338 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1340 struct buffer_head *ret = NULL;
1341 struct bh_lru *lru;
1342 unsigned int i;
1344 check_irqs_on();
1345 bh_lru_lock();
1346 lru = &__get_cpu_var(bh_lrus);
1347 for (i = 0; i < BH_LRU_SIZE; i++) {
1348 struct buffer_head *bh = lru->bhs[i];
1350 if (bh && bh->b_bdev == bdev &&
1351 bh->b_blocknr == block && bh->b_size == size) {
1352 if (i) {
1353 while (i) {
1354 lru->bhs[i] = lru->bhs[i - 1];
1355 i--;
1357 lru->bhs[0] = bh;
1359 get_bh(bh);
1360 ret = bh;
1361 break;
1364 bh_lru_unlock();
1365 return ret;
1369 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1370 * it in the LRU and mark it as accessed. If it is not present then return
1371 * NULL
1373 struct buffer_head *
1374 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1376 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1378 if (bh == NULL) {
1379 bh = __find_get_block_slow(bdev, block);
1380 if (bh)
1381 bh_lru_install(bh);
1383 if (bh)
1384 touch_buffer(bh);
1385 return bh;
1387 EXPORT_SYMBOL(__find_get_block);
1390 * __getblk will locate (and, if necessary, create) the buffer_head
1391 * which corresponds to the passed block_device, block and size. The
1392 * returned buffer has its reference count incremented.
1394 * __getblk() cannot fail - it just keeps trying. If you pass it an
1395 * illegal block number, __getblk() will happily return a buffer_head
1396 * which represents the non-existent block. Very weird.
1398 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1399 * attempt is failing. FIXME, perhaps?
1401 struct buffer_head *
1402 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1404 struct buffer_head *bh = __find_get_block(bdev, block, size);
1406 might_sleep();
1407 if (bh == NULL)
1408 bh = __getblk_slow(bdev, block, size);
1409 return bh;
1411 EXPORT_SYMBOL(__getblk);
1414 * Do async read-ahead on a buffer..
1416 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1418 struct buffer_head *bh = __getblk(bdev, block, size);
1419 if (likely(bh)) {
1420 ll_rw_block(READA, 1, &bh);
1421 brelse(bh);
1424 EXPORT_SYMBOL(__breadahead);
1427 * __bread() - reads a specified block and returns the bh
1428 * @bdev: the block_device to read from
1429 * @block: number of block
1430 * @size: size (in bytes) to read
1432 * Reads a specified block, and returns buffer head that contains it.
1433 * It returns NULL if the block was unreadable.
1435 struct buffer_head *
1436 __bread(struct block_device *bdev, sector_t block, unsigned size)
1438 struct buffer_head *bh = __getblk(bdev, block, size);
1440 if (likely(bh) && !buffer_uptodate(bh))
1441 bh = __bread_slow(bh);
1442 return bh;
1444 EXPORT_SYMBOL(__bread);
1447 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1448 * This doesn't race because it runs in each cpu either in irq
1449 * or with preempt disabled.
1451 static void invalidate_bh_lru(void *arg)
1453 struct bh_lru *b = &get_cpu_var(bh_lrus);
1454 int i;
1456 for (i = 0; i < BH_LRU_SIZE; i++) {
1457 brelse(b->bhs[i]);
1458 b->bhs[i] = NULL;
1460 put_cpu_var(bh_lrus);
1463 void invalidate_bh_lrus(void)
1465 on_each_cpu(invalidate_bh_lru, NULL, 1);
1467 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1469 void set_bh_page(struct buffer_head *bh,
1470 struct page *page, unsigned long offset)
1472 bh->b_page = page;
1473 BUG_ON(offset >= PAGE_SIZE);
1474 if (PageHighMem(page))
1476 * This catches illegal uses and preserves the offset:
1478 bh->b_data = (char *)(0 + offset);
1479 else
1480 bh->b_data = page_address(page) + offset;
1482 EXPORT_SYMBOL(set_bh_page);
1485 * Called when truncating a buffer on a page completely.
1487 static void discard_buffer(struct buffer_head * bh)
1489 lock_buffer(bh);
1490 clear_buffer_dirty(bh);
1491 bh->b_bdev = NULL;
1492 clear_buffer_mapped(bh);
1493 clear_buffer_req(bh);
1494 clear_buffer_new(bh);
1495 clear_buffer_delay(bh);
1496 clear_buffer_unwritten(bh);
1497 unlock_buffer(bh);
1501 * block_invalidatepage - invalidate part of all of a buffer-backed page
1503 * @page: the page which is affected
1504 * @offset: the index of the truncation point
1506 * block_invalidatepage() is called when all or part of the page has become
1507 * invalidatedby a truncate operation.
1509 * block_invalidatepage() does not have to release all buffers, but it must
1510 * ensure that no dirty buffer is left outside @offset and that no I/O
1511 * is underway against any of the blocks which are outside the truncation
1512 * point. Because the caller is about to free (and possibly reuse) those
1513 * blocks on-disk.
1515 void block_invalidatepage(struct page *page, unsigned long offset)
1517 struct buffer_head *head, *bh, *next;
1518 unsigned int curr_off = 0;
1520 BUG_ON(!PageLocked(page));
1521 if (!page_has_buffers(page))
1522 goto out;
1524 head = page_buffers(page);
1525 bh = head;
1526 do {
1527 unsigned int next_off = curr_off + bh->b_size;
1528 next = bh->b_this_page;
1531 * is this block fully invalidated?
1533 if (offset <= curr_off)
1534 discard_buffer(bh);
1535 curr_off = next_off;
1536 bh = next;
1537 } while (bh != head);
1540 * We release buffers only if the entire page is being invalidated.
1541 * The get_block cached value has been unconditionally invalidated,
1542 * so real IO is not possible anymore.
1544 if (offset == 0)
1545 try_to_release_page(page, 0);
1546 out:
1547 return;
1549 EXPORT_SYMBOL(block_invalidatepage);
1552 * We attach and possibly dirty the buffers atomically wrt
1553 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1554 * is already excluded via the page lock.
1556 void create_empty_buffers(struct page *page,
1557 unsigned long blocksize, unsigned long b_state)
1559 struct buffer_head *bh, *head, *tail;
1561 head = alloc_page_buffers(page, blocksize, 1);
1562 bh = head;
1563 do {
1564 bh->b_state |= b_state;
1565 tail = bh;
1566 bh = bh->b_this_page;
1567 } while (bh);
1568 tail->b_this_page = head;
1570 spin_lock(&page->mapping->private_lock);
1571 if (PageUptodate(page) || PageDirty(page)) {
1572 bh = head;
1573 do {
1574 if (PageDirty(page))
1575 set_buffer_dirty(bh);
1576 if (PageUptodate(page))
1577 set_buffer_uptodate(bh);
1578 bh = bh->b_this_page;
1579 } while (bh != head);
1581 attach_page_buffers(page, head);
1582 spin_unlock(&page->mapping->private_lock);
1584 EXPORT_SYMBOL(create_empty_buffers);
1587 * We are taking a block for data and we don't want any output from any
1588 * buffer-cache aliases starting from return from that function and
1589 * until the moment when something will explicitly mark the buffer
1590 * dirty (hopefully that will not happen until we will free that block ;-)
1591 * We don't even need to mark it not-uptodate - nobody can expect
1592 * anything from a newly allocated buffer anyway. We used to used
1593 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1594 * don't want to mark the alias unmapped, for example - it would confuse
1595 * anyone who might pick it with bread() afterwards...
1597 * Also.. Note that bforget() doesn't lock the buffer. So there can
1598 * be writeout I/O going on against recently-freed buffers. We don't
1599 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1600 * only if we really need to. That happens here.
1602 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1604 struct buffer_head *old_bh;
1606 might_sleep();
1608 old_bh = __find_get_block_slow(bdev, block);
1609 if (old_bh) {
1610 clear_buffer_dirty(old_bh);
1611 wait_on_buffer(old_bh);
1612 clear_buffer_req(old_bh);
1613 __brelse(old_bh);
1616 EXPORT_SYMBOL(unmap_underlying_metadata);
1619 * NOTE! All mapped/uptodate combinations are valid:
1621 * Mapped Uptodate Meaning
1623 * No No "unknown" - must do get_block()
1624 * No Yes "hole" - zero-filled
1625 * Yes No "allocated" - allocated on disk, not read in
1626 * Yes Yes "valid" - allocated and up-to-date in memory.
1628 * "Dirty" is valid only with the last case (mapped+uptodate).
1632 * While block_write_full_page is writing back the dirty buffers under
1633 * the page lock, whoever dirtied the buffers may decide to clean them
1634 * again at any time. We handle that by only looking at the buffer
1635 * state inside lock_buffer().
1637 * If block_write_full_page() is called for regular writeback
1638 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1639 * locked buffer. This only can happen if someone has written the buffer
1640 * directly, with submit_bh(). At the address_space level PageWriteback
1641 * prevents this contention from occurring.
/*
 * __block_write_full_page - start writeback of the dirty buffers of a page
 * @inode:     inode owning the page; supplies i_blkbits and i_size
 * @page:      page to write; must be locked (BUG otherwise)
 * @get_block: callback used to map dirty buffers that are unmapped or
 *             delayed; it is called with create == 1
 * @wbc:       writeback control; sync_mode and nonblocking decide whether a
 *             buffer lock is waited for or only tried
 *
 * If the page has no buffers yet, dirty+uptodate buffers are attached.
 * The buffer ring is then walked three times:
 *   1. buffers past the last block inside i_size are cleaned and marked
 *      uptodate; dirty buffers that are unmapped or delayed are mapped
 *      through @get_block, and blockdev aliases of newly allocated blocks
 *      are dropped with unmap_underlying_metadata();
 *   2. every mapped buffer is locked (or try-locked, redirtying the page
 *      when the try fails) and the dirty ones are flagged for async write;
 *   3. the page is put under writeback, the flagged buffers are submitted
 *      and the page is unlocked.
 * If no buffer was submitted, writeback is ended right away.
 *
 * If @get_block fails, the recover path still submits the buffers that are
 * mapped, dirty and not delayed, clears the dirty bit of all others, and
 * records the error on the page and on its mapping.
 *
 * Returns 0, or the error returned by @get_block.
 */
1643 static int __block_write_full_page(struct inode *inode, struct page *page,
1644 get_block_t *get_block, struct writeback_control *wbc)
1646 int err;
1647 sector_t block;
1648 sector_t last_block;
1649 struct buffer_head *bh, *head;
1650 const unsigned blocksize = 1 << inode->i_blkbits;
1651 int nr_underway = 0;
1653 BUG_ON(!PageLocked(page));
/* Index of the last block that still lies inside i_size. */
1655 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1657 if (!page_has_buffers(page)) {
1658 create_empty_buffers(page, blocksize,
1659 (1 << BH_Dirty)|(1 << BH_Uptodate));
1663 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1664 * here, and the (potentially unmapped) buffers may become dirty at
1665 * any time. If a buffer becomes dirty here after we've inspected it
1666 * then we just miss that fact, and the page stays dirty.
1668 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1669 * handle that here by just cleaning them.
1672 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1673 head = page_buffers(page);
1674 bh = head;
1677 * Get all the dirty buffers mapped to disk addresses and
1678 * handle any aliases from the underlying blockdev's mapping.
1680 do {
1681 if (block > last_block) {
1683 * mapped buffers outside i_size will occur, because
1684 * this page can be outside i_size when there is a
1685 * truncate in progress.
1688 * The buffer was zeroed by block_write_full_page()
1690 clear_buffer_dirty(bh);
1691 set_buffer_uptodate(bh);
1692 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1693 buffer_dirty(bh)) {
1694 WARN_ON(bh->b_size != blocksize);
1695 err = get_block(inode, block, bh, 1);
1696 if (err)
1697 goto recover;
1698 clear_buffer_delay(bh);
1699 if (buffer_new(bh)) {
1700 /* blockdev mappings never come here */
1701 clear_buffer_new(bh);
1702 unmap_underlying_metadata(bh->b_bdev,
1703 bh->b_blocknr);
1706 bh = bh->b_this_page;
1707 block++;
1708 } while (bh != head);
1710 do {
1711 if (!buffer_mapped(bh))
1712 continue;
1714 * If it's a fully non-blocking write attempt and we cannot
1715 * lock the buffer then redirty the page. Note that this can
1716 * potentially cause a busy-wait loop from pdflush and kswapd
1717 * activity, but those code paths have their own higher-level
1718 * throttling.
1720 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1721 lock_buffer(bh);
1722 } else if (!trylock_buffer(bh)) {
1723 redirty_page_for_writepage(wbc, page);
1724 continue;
1726 if (test_clear_buffer_dirty(bh)) {
1727 mark_buffer_async_write(bh);
1728 } else {
1729 unlock_buffer(bh);
1731 } while ((bh = bh->b_this_page) != head);
1734 * The page and its buffers are protected by PageWriteback(), so we can
1735 * drop the bh refcounts early.
1737 BUG_ON(PageWriteback(page));
1738 set_page_writeback(page);
1740 do {
1741 struct buffer_head *next = bh->b_this_page;
1742 if (buffer_async_write(bh)) {
1743 submit_bh(WRITE, bh);
1744 nr_underway++;
1746 bh = next;
1747 } while (bh != head);
1748 unlock_page(page);
1750 err = 0;
1751 done:
1752 if (nr_underway == 0) {
1754 * The page was marked dirty, but the buffers were
1755 * clean. Someone wrote them back by hand with
1756 * ll_rw_block/submit_bh. A rare case.
1758 end_page_writeback(page);
1761 * The page and buffer_heads can be released at any time from
1762 * here on.
1765 return err;
1767 recover:
1769 * ENOSPC, or some other error. We may already have added some
1770 * blocks to the file, so we need to write these out to avoid
1771 * exposing stale data.
1772 * The page is currently locked and not marked for writeback
1774 bh = head;
1775 /* Recovery: lock and submit the mapped buffers */
1776 do {
1777 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1778 !buffer_delay(bh)) {
1779 lock_buffer(bh);
1780 mark_buffer_async_write(bh);
1781 } else {
1783 * The buffer may have been set dirty during
1784 * attachment to a dirty page.
1786 clear_buffer_dirty(bh);
1788 } while ((bh = bh->b_this_page) != head);
1789 SetPageError(page);
1790 BUG_ON(PageWriteback(page));
1791 mapping_set_error(page->mapping, err);
1792 set_page_writeback(page);
1793 do {
1794 struct buffer_head *next = bh->b_this_page;
1795 if (buffer_async_write(bh)) {
1796 clear_buffer_dirty(bh);
1797 submit_bh(WRITE, bh);
1798 nr_underway++;
1800 bh = next;
1801 } while (bh != head);
1802 unlock_page(page);
1803 goto done;
1807 * If a page has any new buffers, zero them out here, and mark them uptodate
1808 * and dirty so they'll be written out (in order to prevent uninitialised
1809 * block data from leaking). And clear the new bit.
1811 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1813 unsigned int block_start, block_end;
1814 struct buffer_head *head, *bh;
1816 BUG_ON(!PageLocked(page));
1817 if (!page_has_buffers(page))
1818 return;
1820 bh = head = page_buffers(page);
1821 block_start = 0;
1822 do {
1823 block_end = block_start + bh->b_size;
1825 if (buffer_new(bh)) {
1826 if (block_end > from && block_start < to) {
1827 if (!PageUptodate(page)) {
1828 unsigned start, size;
1830 start = max(from, block_start);
1831 size = min(to, block_end) - start;
1833 zero_user(page, start, size);
1834 set_buffer_uptodate(bh);
1837 clear_buffer_new(bh);
1838 mark_buffer_dirty(bh);
1842 block_start = block_end;
1843 bh = bh->b_this_page;
1844 } while (bh != head);
1846 EXPORT_SYMBOL(page_zero_new_buffers);
/*
 * __block_prepare_write - get the buffers under a byte range ready for a write
 * @inode:     inode owning the page; supplies i_blkbits
 * @page:      page to prepare; must be locked (BUG otherwise)
 * @from:      first byte of the range inside the page
 * @to:        end of the range; from <= to <= PAGE_CACHE_SIZE (BUG otherwise)
 * @get_block: callback used to map unmapped buffers; called with create == 1
 *
 * Buffers are attached to the page if it has none.  Buffers entirely outside
 * the range are only marked uptodate when the page is uptodate.  Buffers that
 * overlap the range are mapped if necessary.  For a newly allocated block the
 * blockdev alias is dropped with unmap_underlying_metadata() and, unless the
 * page is uptodate, the parts of the buffer outside the range are zeroed.
 * Buffers that are only partially covered and are neither uptodate, delayed
 * nor unwritten are read in, and the reads are waited for before returning.
 *
 * Returns 0, the error from @get_block, or -EIO when a read did not leave its
 * buffer uptodate.  On any error, page_zero_new_buffers() is run over the
 * range.
 */
1848 static int __block_prepare_write(struct inode *inode, struct page *page,
1849 unsigned from, unsigned to, get_block_t *get_block)
1851 unsigned block_start, block_end;
1852 sector_t block;
1853 int err = 0;
1854 unsigned blocksize, bbits;
/* NOTE(review): two wait slots, presumably because only the first and the last buffer of the range can be partially covered; confirm. */
1855 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1857 BUG_ON(!PageLocked(page));
1858 BUG_ON(from > PAGE_CACHE_SIZE);
1859 BUG_ON(to > PAGE_CACHE_SIZE);
1860 BUG_ON(from > to);
1862 blocksize = 1 << inode->i_blkbits;
1863 if (!page_has_buffers(page))
1864 create_empty_buffers(page, blocksize, 0);
1865 head = page_buffers(page);
1867 bbits = inode->i_blkbits;
/* Block number of the first buffer of the page. */
1868 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
/* One pass over the ring: block_start is 0 only on the first visit of head. */
1870 for(bh = head, block_start = 0; bh != head || !block_start;
1871 block++, block_start=block_end, bh = bh->b_this_page) {
1872 block_end = block_start + blocksize;
/* Buffer does not overlap the range being written. */
1873 if (block_end <= from || block_start >= to) {
1874 if (PageUptodate(page)) {
1875 if (!buffer_uptodate(bh))
1876 set_buffer_uptodate(bh);
1878 continue;
1880 if (buffer_new(bh))
1881 clear_buffer_new(bh);
1882 if (!buffer_mapped(bh)) {
1883 WARN_ON(bh->b_size != blocksize);
1884 err = get_block(inode, block, bh, 1);
1885 if (err)
1886 break;
/* get_block() reported a freshly allocated block. */
1887 if (buffer_new(bh)) {
1888 unmap_underlying_metadata(bh->b_bdev,
1889 bh->b_blocknr);
1890 if (PageUptodate(page)) {
1891 clear_buffer_new(bh);
1892 set_buffer_uptodate(bh);
1893 mark_buffer_dirty(bh);
1894 continue;
/* Zero the pieces of the new buffer that the write will not cover. */
1896 if (block_end > to || block_start < from)
1897 zero_user_segments(page,
1898 to, block_end,
1899 block_start, from);
1900 continue;
1903 if (PageUptodate(page)) {
1904 if (!buffer_uptodate(bh))
1905 set_buffer_uptodate(bh);
1906 continue;
/* Partially overwritten buffer with unknown contents: read it first. */
1908 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1909 !buffer_unwritten(bh) &&
1910 (block_start < from || block_end > to)) {
1911 ll_rw_block(READ, 1, &bh);
1912 *wait_bh++=bh;
1916 * If we issued read requests - let them complete.
1918 while(wait_bh > wait) {
1919 wait_on_buffer(*--wait_bh);
1920 if (!buffer_uptodate(*wait_bh))
1921 err = -EIO;
1923 if (unlikely(err))
1924 page_zero_new_buffers(page, from, to);
1925 return err;
1928 static int __block_commit_write(struct inode *inode, struct page *page,
1929 unsigned from, unsigned to)
1931 unsigned block_start, block_end;
1932 int partial = 0;
1933 unsigned blocksize;
1934 struct buffer_head *bh, *head;
1936 blocksize = 1 << inode->i_blkbits;
1938 for(bh = head = page_buffers(page), block_start = 0;
1939 bh != head || !block_start;
1940 block_start=block_end, bh = bh->b_this_page) {
1941 block_end = block_start + blocksize;
1942 if (block_end <= from || block_start >= to) {
1943 if (!buffer_uptodate(bh))
1944 partial = 1;
1945 } else {
1946 set_buffer_uptodate(bh);
1947 mark_buffer_dirty(bh);
1949 clear_buffer_new(bh);
1953 * If this is a partial write which happened to make all buffers
1954 * uptodate then we can optimize away a bogus readpage() for
1955 * the next read(). Here we 'discover' whether the page went
1956 * uptodate as a result of this (potentially partial) write.
1958 if (!partial)
1959 SetPageUptodate(page);
1960 return 0;
1964 * block_write_begin takes care of the basic task of block allocation and
1965 * bringing partial write blocks uptodate first.
1967 * If *pagep is not NULL, then block_write_begin uses the locked page
1968 * at *pagep rather than allocating its own. In this case, the page will
1969 * not be unlocked or deallocated on failure.
/*
 * block_write_begin - find or take the page for a write and prepare its buffers
 * @file:      not used by this function
 * @mapping:   address space being written; its host inode is used
 * @pos:       file position of the write
 * @len:       number of bytes to be written; the range inside the page is
 *             [pos & (PAGE_CACHE_SIZE - 1), that offset + len)
 * @flags:     not used by this function
 * @pagep:     in: a locked page to use, or NULL; out: the page that was used,
 *             reset to NULL when a page grabbed here is released on failure
 * @fsdata:    not used by this function
 * @get_block: block mapping callback handed to __block_prepare_write()
 *
 * Returns 0 on success, -ENOMEM if no page could be grabbed, or the error
 * from __block_prepare_write().
 *
 * NOTE(review): the brace-only lines are missing from this listing, so the
 * nesting of the failure path cannot be read off the text.  The trimming
 * with vmtruncate() presumably belongs to the ownpage branch; confirm
 * against the pristine source before relying on it.
 */
1971 int block_write_begin(struct file *file, struct address_space *mapping,
1972 loff_t pos, unsigned len, unsigned flags,
1973 struct page **pagep, void **fsdata,
1974 get_block_t *get_block)
1976 struct inode *inode = mapping->host;
1977 int status = 0;
1978 struct page *page;
1979 pgoff_t index;
1980 unsigned start, end;
/* Set when the page is grabbed here rather than supplied by the caller. */
1981 int ownpage = 0;
1983 index = pos >> PAGE_CACHE_SHIFT;
1984 start = pos & (PAGE_CACHE_SIZE - 1);
1985 end = start + len;
1987 page = *pagep;
1988 if (page == NULL) {
1989 ownpage = 1;
1990 page = __grab_cache_page(mapping, index);
1991 if (!page) {
1992 status = -ENOMEM;
1993 goto out;
1995 *pagep = page;
1996 } else
1997 BUG_ON(!PageLocked(page));
1999 status = __block_prepare_write(inode, page, start, end, get_block);
2000 if (unlikely(status)) {
2001 ClearPageUptodate(page);
2003 if (ownpage) {
2004 unlock_page(page);
2005 page_cache_release(page);
2006 *pagep = NULL;
2009 * prepare_write() may have instantiated a few blocks
2010 * outside i_size. Trim these off again. Don't need
2011 * i_size_read because we hold i_mutex.
2013 if (pos + len > inode->i_size)
2014 vmtruncate(inode, inode->i_size);
2016 goto out;
2019 out:
2020 return status;
2022 EXPORT_SYMBOL(block_write_begin);
2024 int block_write_end(struct file *file, struct address_space *mapping,
2025 loff_t pos, unsigned len, unsigned copied,
2026 struct page *page, void *fsdata)
2028 struct inode *inode = mapping->host;
2029 unsigned start;
2031 start = pos & (PAGE_CACHE_SIZE - 1);
2033 if (unlikely(copied < len)) {
2035 * The buffers that were written will now be uptodate, so we
2036 * don't have to worry about a readpage reading them and
2037 * overwriting a partial write. However if we have encountered
2038 * a short write and only partially written into a buffer, it
2039 * will not be marked uptodate, so a readpage might come in and
2040 * destroy our partial write.
2042 * Do the simplest thing, and just treat any short write to a
2043 * non uptodate page as a zero-length write, and force the
2044 * caller to redo the whole thing.
2046 if (!PageUptodate(page))
2047 copied = 0;
2049 page_zero_new_buffers(page, start+copied, start+len);
2051 flush_dcache_page(page);
2053 /* This could be a short (even 0-length) commit */
2054 __block_commit_write(inode, page, start, start+copied);
2056 return copied;
2058 EXPORT_SYMBOL(block_write_end);
2060 int generic_write_end(struct file *file, struct address_space *mapping,
2061 loff_t pos, unsigned len, unsigned copied,
2062 struct page *page, void *fsdata)
2064 struct inode *inode = mapping->host;
2065 int i_size_changed = 0;
2067 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2070 * No need to use i_size_read() here, the i_size
2071 * cannot change under us because we hold i_mutex.
2073 * But it's important to update i_size while still holding page lock:
2074 * page writeout could otherwise come in and zero beyond i_size.
2076 if (pos+copied > inode->i_size) {
2077 i_size_write(inode, pos+copied);
2078 i_size_changed = 1;
2081 unlock_page(page);
2082 page_cache_release(page);
2085 * Don't mark the inode dirty under page lock. First, it unnecessarily
2086 * makes the holding time of page lock longer. Second, it forces lock
2087 * ordering of page lock and transaction start for journaling
2088 * filesystems.
2090 if (i_size_changed)
2091 mark_inode_dirty(inode);
2093 return copied;
2095 EXPORT_SYMBOL(generic_write_end);
2098 * block_is_partially_uptodate checks whether buffers within a page are
2099 * uptodate or not.
2101 * Returns true if all buffers which correspond to a file portion
2102 * we want to read are uptodate.
2104 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2105 unsigned long from)
2107 struct inode *inode = page->mapping->host;
2108 unsigned block_start, block_end, blocksize;
2109 unsigned to;
2110 struct buffer_head *bh, *head;
2111 int ret = 1;
2113 if (!page_has_buffers(page))
2114 return 0;
2116 blocksize = 1 << inode->i_blkbits;
2117 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2118 to = from + to;
2119 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2120 return 0;
2122 head = page_buffers(page);
2123 bh = head;
2124 block_start = 0;
2125 do {
2126 block_end = block_start + blocksize;
2127 if (block_end > from && block_start < to) {
2128 if (!buffer_uptodate(bh)) {
2129 ret = 0;
2130 break;
2132 if (block_end >= to)
2133 break;
2135 block_start = block_end;
2136 bh = bh->b_this_page;
2137 } while (bh != head);
2139 return ret;
2141 EXPORT_SYMBOL(block_is_partially_uptodate);
2144 * Generic "read page" function for block devices that have the normal
2145 * get_block functionality. This is most of the block device filesystems.
2146 * Reads the page asynchronously --- the unlock_buffer() and
2147 * set/clear_buffer_uptodate() functions propagate buffer state into the
2148 * page struct once IO has completed.
2150 int block_read_full_page(struct page *page, get_block_t *get_block)
2152 struct inode *inode = page->mapping->host;
2153 sector_t iblock, lblock;
2154 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2155 unsigned int blocksize;
2156 int nr, i;
2157 int fully_mapped = 1;
2159 BUG_ON(!PageLocked(page));
2160 blocksize = 1 << inode->i_blkbits;
2161 if (!page_has_buffers(page))
2162 create_empty_buffers(page, blocksize, 0);
2163 head = page_buffers(page);
2165 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2166 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2167 bh = head;
2168 nr = 0;
2169 i = 0;
2171 do {
2172 if (buffer_uptodate(bh))
2173 continue;
2175 if (!buffer_mapped(bh)) {
2176 int err = 0;
2178 fully_mapped = 0;
2179 if (iblock < lblock) {
2180 WARN_ON(bh->b_size != blocksize);
2181 err = get_block(inode, iblock, bh, 0);
2182 if (err)
2183 SetPageError(page);
2185 if (!buffer_mapped(bh)) {
2186 zero_user(page, i * blocksize, blocksize);
2187 if (!err)
2188 set_buffer_uptodate(bh);
2189 continue;
2192 * get_block() might have updated the buffer
2193 * synchronously
2195 if (buffer_uptodate(bh))
2196 continue;
2198 arr[nr++] = bh;
2199 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2201 if (fully_mapped)
2202 SetPageMappedToDisk(page);
2204 if (!nr) {
2206 * All buffers are uptodate - we can set the page uptodate
2207 * as well. But not if get_block() returned an error.
2209 if (!PageError(page))
2210 SetPageUptodate(page);
2211 unlock_page(page);
2212 return 0;
2215 /* Stage two: lock the buffers */
2216 for (i = 0; i < nr; i++) {
2217 bh = arr[i];
2218 lock_buffer(bh);
2219 mark_buffer_async_read(bh);
2223 * Stage 3: start the IO. Check for uptodateness
2224 * inside the buffer lock in case another process reading
2225 * the underlying blockdev brought it uptodate (the sct fix).
2227 for (i = 0; i < nr; i++) {
2228 bh = arr[i];
2229 if (buffer_uptodate(bh))
2230 end_buffer_async_read(bh, 1);
2231 else
2232 submit_bh(READ, bh);
2234 return 0;
2237 /* utility function for filesystems that need to do work on expanding
2238 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2239 * deal with the hole.
2241 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2243 struct address_space *mapping = inode->i_mapping;
2244 struct page *page;
2245 void *fsdata;
2246 unsigned long limit;
2247 int err;
2249 err = -EFBIG;
2250 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2251 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2252 send_sig(SIGXFSZ, current, 0);
2253 goto out;
2255 if (size > inode->i_sb->s_maxbytes)
2256 goto out;
2258 err = pagecache_write_begin(NULL, mapping, size, 0,
2259 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2260 &page, &fsdata);
2261 if (err)
2262 goto out;
2264 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2265 BUG_ON(err > 0);
2267 out:
2268 return err;
2271 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2272 loff_t pos, loff_t *bytes)
2274 struct inode *inode = mapping->host;
2275 unsigned blocksize = 1 << inode->i_blkbits;
2276 struct page *page;
2277 void *fsdata;
2278 pgoff_t index, curidx;
2279 loff_t curpos;
2280 unsigned zerofrom, offset, len;
2281 int err = 0;
2283 index = pos >> PAGE_CACHE_SHIFT;
2284 offset = pos & ~PAGE_CACHE_MASK;
2286 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2287 zerofrom = curpos & ~PAGE_CACHE_MASK;
2288 if (zerofrom & (blocksize-1)) {
2289 *bytes |= (blocksize-1);
2290 (*bytes)++;
2292 len = PAGE_CACHE_SIZE - zerofrom;
2294 err = pagecache_write_begin(file, mapping, curpos, len,
2295 AOP_FLAG_UNINTERRUPTIBLE,
2296 &page, &fsdata);
2297 if (err)
2298 goto out;
2299 zero_user(page, zerofrom, len);
2300 err = pagecache_write_end(file, mapping, curpos, len, len,
2301 page, fsdata);
2302 if (err < 0)
2303 goto out;
2304 BUG_ON(err != len);
2305 err = 0;
2307 balance_dirty_pages_ratelimited(mapping);
2310 /* page covers the boundary, find the boundary offset */
2311 if (index == curidx) {
2312 zerofrom = curpos & ~PAGE_CACHE_MASK;
2313 /* if we will expand the thing last block will be filled */
2314 if (offset <= zerofrom) {
2315 goto out;
2317 if (zerofrom & (blocksize-1)) {
2318 *bytes |= (blocksize-1);
2319 (*bytes)++;
2321 len = offset - zerofrom;
2323 err = pagecache_write_begin(file, mapping, curpos, len,
2324 AOP_FLAG_UNINTERRUPTIBLE,
2325 &page, &fsdata);
2326 if (err)
2327 goto out;
2328 zero_user(page, zerofrom, len);
2329 err = pagecache_write_end(file, mapping, curpos, len, len,
2330 page, fsdata);
2331 if (err < 0)
2332 goto out;
2333 BUG_ON(err != len);
2334 err = 0;
2336 out:
2337 return err;
2341 * For moronic filesystems that do not allow holes in file.
2342 * We may have to extend the file.
2344 int cont_write_begin(struct file *file, struct address_space *mapping,
2345 loff_t pos, unsigned len, unsigned flags,
2346 struct page **pagep, void **fsdata,
2347 get_block_t *get_block, loff_t *bytes)
2349 struct inode *inode = mapping->host;
2350 unsigned blocksize = 1 << inode->i_blkbits;
2351 unsigned zerofrom;
2352 int err;
2354 err = cont_expand_zero(file, mapping, pos, bytes);
2355 if (err)
2356 goto out;
2358 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2359 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2360 *bytes |= (blocksize-1);
2361 (*bytes)++;
2364 *pagep = NULL;
2365 err = block_write_begin(file, mapping, pos, len,
2366 flags, pagep, fsdata, get_block);
2367 out:
2368 return err;
2371 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2372 get_block_t *get_block)
2374 struct inode *inode = page->mapping->host;
2375 int err = __block_prepare_write(inode, page, from, to, get_block);
2376 if (err)
2377 ClearPageUptodate(page);
2378 return err;
2381 int block_commit_write(struct page *page, unsigned from, unsigned to)
2383 struct inode *inode = page->mapping->host;
2384 __block_commit_write(inode,page,from,to);
2385 return 0;
2389 * block_page_mkwrite() is not allowed to change the file size as it gets
2390 * called from a page fault handler when a page is first dirtied. Hence we must
2391 * be careful to check for EOF conditions here. We set the page up correctly
2392 * for a written page which means we get ENOSPC checking when writing into
2393 * holes and correct delalloc and unwritten extent mapping on filesystems that
2394 * support these features.
2396 * We are not allowed to take the i_mutex here so we have to play games to
2397 * protect against truncate races as the page could now be beyond EOF. Because
2398 * vmtruncate() writes the inode size before removing pages, once we have the
2399 * page lock we can determine safely if the page is beyond EOF. If it is not
2400 * beyond EOF, then the page is guaranteed safe against truncation until we
2401 * unlock the page.
2404 block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2405 get_block_t get_block)
2407 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2408 unsigned long end;
2409 loff_t size;
2410 int ret = -EINVAL;
2412 lock_page(page);
2413 size = i_size_read(inode);
2414 if ((page->mapping != inode->i_mapping) ||
2415 (page_offset(page) > size)) {
2416 /* page got truncated out from underneath us */
2417 goto out_unlock;
2420 /* page is wholly or partially inside EOF */
2421 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2422 end = size & ~PAGE_CACHE_MASK;
2423 else
2424 end = PAGE_CACHE_SIZE;
2426 ret = block_prepare_write(page, 0, end, get_block);
2427 if (!ret)
2428 ret = block_commit_write(page, 0, end);
2430 out_unlock:
2431 unlock_page(page);
2432 return ret;
/*
 * The prereads issued by nobh_write_begin() are special: their
 * buffer_heads are freed immediately, while still under the page lock.
 * They therefore need an end_io handler that does not touch the bh once
 * it has been unlocked; this one simply forwards to
 * __end_buffer_read_notouch().
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}
2446 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2447 * the page (converting it to circular linked list and taking care of page
2448 * dirty races).
2450 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2452 struct buffer_head *bh;
2454 BUG_ON(!PageLocked(page));
2456 spin_lock(&page->mapping->private_lock);
2457 bh = head;
2458 do {
2459 if (PageDirty(page))
2460 set_buffer_dirty(bh);
2461 if (!bh->b_this_page)
2462 bh->b_this_page = head;
2463 bh = bh->b_this_page;
2464 } while (bh != head);
2465 attach_page_buffers(page, head);
2466 spin_unlock(&page->mapping->private_lock);
2470 * On entry, the page is fully not uptodate.
2471 * On exit the page is fully uptodate in the areas outside (from,to)
2473 int nobh_write_begin(struct file *file, struct address_space *mapping,
2474 loff_t pos, unsigned len, unsigned flags,
2475 struct page **pagep, void **fsdata,
2476 get_block_t *get_block)
2478 struct inode *inode = mapping->host;
2479 const unsigned blkbits = inode->i_blkbits;
2480 const unsigned blocksize = 1 << blkbits;
2481 struct buffer_head *head, *bh;
2482 struct page *page;
2483 pgoff_t index;
2484 unsigned from, to;
2485 unsigned block_in_page;
2486 unsigned block_start, block_end;
2487 sector_t block_in_file;
2488 int nr_reads = 0;
2489 int ret = 0;
2490 int is_mapped_to_disk = 1;
2492 index = pos >> PAGE_CACHE_SHIFT;
2493 from = pos & (PAGE_CACHE_SIZE - 1);
2494 to = from + len;
2496 page = __grab_cache_page(mapping, index);
2497 if (!page)
2498 return -ENOMEM;
2499 *pagep = page;
2500 *fsdata = NULL;
2502 if (page_has_buffers(page)) {
2503 unlock_page(page);
2504 page_cache_release(page);
2505 *pagep = NULL;
2506 return block_write_begin(file, mapping, pos, len, flags, pagep,
2507 fsdata, get_block);
2510 if (PageMappedToDisk(page))
2511 return 0;
2514 * Allocate buffers so that we can keep track of state, and potentially
2515 * attach them to the page if an error occurs. In the common case of
2516 * no error, they will just be freed again without ever being attached
2517 * to the page (which is all OK, because we're under the page lock).
2519 * Be careful: the buffer linked list is a NULL terminated one, rather
2520 * than the circular one we're used to.
2522 head = alloc_page_buffers(page, blocksize, 0);
2523 if (!head) {
2524 ret = -ENOMEM;
2525 goto out_release;
2528 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2531 * We loop across all blocks in the page, whether or not they are
2532 * part of the affected region. This is so we can discover if the
2533 * page is fully mapped-to-disk.
2535 for (block_start = 0, block_in_page = 0, bh = head;
2536 block_start < PAGE_CACHE_SIZE;
2537 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2538 int create;
2540 block_end = block_start + blocksize;
2541 bh->b_state = 0;
2542 create = 1;
2543 if (block_start >= to)
2544 create = 0;
2545 ret = get_block(inode, block_in_file + block_in_page,
2546 bh, create);
2547 if (ret)
2548 goto failed;
2549 if (!buffer_mapped(bh))
2550 is_mapped_to_disk = 0;
2551 if (buffer_new(bh))
2552 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2553 if (PageUptodate(page)) {
2554 set_buffer_uptodate(bh);
2555 continue;
2557 if (buffer_new(bh) || !buffer_mapped(bh)) {
2558 zero_user_segments(page, block_start, from,
2559 to, block_end);
2560 continue;
2562 if (buffer_uptodate(bh))
2563 continue; /* reiserfs does this */
2564 if (block_start < from || block_end > to) {
2565 lock_buffer(bh);
2566 bh->b_end_io = end_buffer_read_nobh;
2567 submit_bh(READ, bh);
2568 nr_reads++;
2572 if (nr_reads) {
2574 * The page is locked, so these buffers are protected from
2575 * any VM or truncate activity. Hence we don't need to care
2576 * for the buffer_head refcounts.
2578 for (bh = head; bh; bh = bh->b_this_page) {
2579 wait_on_buffer(bh);
2580 if (!buffer_uptodate(bh))
2581 ret = -EIO;
2583 if (ret)
2584 goto failed;
2587 if (is_mapped_to_disk)
2588 SetPageMappedToDisk(page);
2590 *fsdata = head; /* to be released by nobh_write_end */
2592 return 0;
2594 failed:
2595 BUG_ON(!ret);
2597 * Error recovery is a bit difficult. We need to zero out blocks that
2598 * were newly allocated, and dirty them to ensure they get written out.
2599 * Buffers need to be attached to the page at this point, otherwise
2600 * the handling of potential IO errors during writeout would be hard
2601 * (could try doing synchronous writeout, but what if that fails too?)
2603 attach_nobh_buffers(page, head);
2604 page_zero_new_buffers(page, from, to);
2606 out_release:
2607 unlock_page(page);
2608 page_cache_release(page);
2609 *pagep = NULL;
2611 if (pos + len > inode->i_size)
2612 vmtruncate(inode, inode->i_size);
2614 return ret;
2616 EXPORT_SYMBOL(nobh_write_begin);
2618 int nobh_write_end(struct file *file, struct address_space *mapping,
2619 loff_t pos, unsigned len, unsigned copied,
2620 struct page *page, void *fsdata)
2622 struct inode *inode = page->mapping->host;
2623 struct buffer_head *head = fsdata;
2624 struct buffer_head *bh;
2625 BUG_ON(fsdata != NULL && page_has_buffers(page));
2627 if (unlikely(copied < len) && !page_has_buffers(page))
2628 attach_nobh_buffers(page, head);
2629 if (page_has_buffers(page))
2630 return generic_write_end(file, mapping, pos, len,
2631 copied, page, fsdata);
2633 SetPageUptodate(page);
2634 set_page_dirty(page);
2635 if (pos+copied > inode->i_size) {
2636 i_size_write(inode, pos+copied);
2637 mark_inode_dirty(inode);
2640 unlock_page(page);
2641 page_cache_release(page);
2643 while (head) {
2644 bh = head;
2645 head = head->b_this_page;
2646 free_buffer_head(bh);
2649 return copied;
2651 EXPORT_SYMBOL(nobh_write_end);
2654 * nobh_writepage() - based on block_full_write_page() except
2655 * that it tries to operate without attaching bufferheads to
2656 * the page.
2658 int nobh_writepage(struct page *page, get_block_t *get_block,
2659 struct writeback_control *wbc)
2661 struct inode * const inode = page->mapping->host;
2662 loff_t i_size = i_size_read(inode);
2663 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2664 unsigned offset;
2665 int ret;
2667 /* Is the page fully inside i_size? */
2668 if (page->index < end_index)
2669 goto out;
2671 /* Is the page fully outside i_size? (truncate in progress) */
2672 offset = i_size & (PAGE_CACHE_SIZE-1);
2673 if (page->index >= end_index+1 || !offset) {
2675 * The page may have dirty, unmapped buffers. For example,
2676 * they may have been added in ext3_writepage(). Make them
2677 * freeable here, so the page does not leak.
2679 #if 0
2680 /* Not really sure about this - do we need this ? */
2681 if (page->mapping->a_ops->invalidatepage)
2682 page->mapping->a_ops->invalidatepage(page, offset);
2683 #endif
2684 unlock_page(page);
2685 return 0; /* don't care */
2689 * The page straddles i_size. It must be zeroed out on each and every
2690 * writepage invocation because it may be mmapped. "A file is mapped
2691 * in multiples of the page size. For a file that is not a multiple of
2692 * the page size, the remaining memory is zeroed when mapped, and
2693 * writes to that region are not written out to the file."
2695 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2696 out:
2697 ret = mpage_writepage(page, get_block, wbc);
2698 if (ret == -EAGAIN)
2699 ret = __block_write_full_page(inode, page, get_block, wbc);
2700 return ret;
2702 EXPORT_SYMBOL(nobh_writepage);
2704 int nobh_truncate_page(struct address_space *mapping,
2705 loff_t from, get_block_t *get_block)
2707 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2708 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2709 unsigned blocksize;
2710 sector_t iblock;
2711 unsigned length, pos;
2712 struct inode *inode = mapping->host;
2713 struct page *page;
2714 struct buffer_head map_bh;
2715 int err;
2717 blocksize = 1 << inode->i_blkbits;
2718 length = offset & (blocksize - 1);
2720 /* Block boundary? Nothing to do */
2721 if (!length)
2722 return 0;
2724 length = blocksize - length;
2725 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2727 page = grab_cache_page(mapping, index);
2728 err = -ENOMEM;
2729 if (!page)
2730 goto out;
2732 if (page_has_buffers(page)) {
2733 has_buffers:
2734 unlock_page(page);
2735 page_cache_release(page);
2736 return block_truncate_page(mapping, from, get_block);
2739 /* Find the buffer that contains "offset" */
2740 pos = blocksize;
2741 while (offset >= pos) {
2742 iblock++;
2743 pos += blocksize;
2746 err = get_block(inode, iblock, &map_bh, 0);
2747 if (err)
2748 goto unlock;
2749 /* unmapped? It's a hole - nothing to do */
2750 if (!buffer_mapped(&map_bh))
2751 goto unlock;
2753 /* Ok, it's mapped. Make sure it's up-to-date */
2754 if (!PageUptodate(page)) {
2755 err = mapping->a_ops->readpage(NULL, page);
2756 if (err) {
2757 page_cache_release(page);
2758 goto out;
2760 lock_page(page);
2761 if (!PageUptodate(page)) {
2762 err = -EIO;
2763 goto unlock;
2765 if (page_has_buffers(page))
2766 goto has_buffers;
2768 zero_user(page, offset, length);
2769 set_page_dirty(page);
2770 err = 0;
2772 unlock:
2773 unlock_page(page);
2774 page_cache_release(page);
2775 out:
2776 return err;
2778 EXPORT_SYMBOL(nobh_truncate_page);
2780 int block_truncate_page(struct address_space *mapping,
2781 loff_t from, get_block_t *get_block)
2783 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2784 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2785 unsigned blocksize;
2786 sector_t iblock;
2787 unsigned length, pos;
2788 struct inode *inode = mapping->host;
2789 struct page *page;
2790 struct buffer_head *bh;
2791 int err;
2793 blocksize = 1 << inode->i_blkbits;
2794 length = offset & (blocksize - 1);
2796 /* Block boundary? Nothing to do */
2797 if (!length)
2798 return 0;
2800 length = blocksize - length;
2801 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2803 page = grab_cache_page(mapping, index);
2804 err = -ENOMEM;
2805 if (!page)
2806 goto out;
2808 if (!page_has_buffers(page))
2809 create_empty_buffers(page, blocksize, 0);
2811 /* Find the buffer that contains "offset" */
2812 bh = page_buffers(page);
2813 pos = blocksize;
2814 while (offset >= pos) {
2815 bh = bh->b_this_page;
2816 iblock++;
2817 pos += blocksize;
2820 err = 0;
2821 if (!buffer_mapped(bh)) {
2822 WARN_ON(bh->b_size != blocksize);
2823 err = get_block(inode, iblock, bh, 0);
2824 if (err)
2825 goto unlock;
2826 /* unmapped? It's a hole - nothing to do */
2827 if (!buffer_mapped(bh))
2828 goto unlock;
2831 /* Ok, it's mapped. Make sure it's up-to-date */
2832 if (PageUptodate(page))
2833 set_buffer_uptodate(bh);
2835 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2836 err = -EIO;
2837 ll_rw_block(READ, 1, &bh);
2838 wait_on_buffer(bh);
2839 /* Uhhuh. Read error. Complain and punt. */
2840 if (!buffer_uptodate(bh))
2841 goto unlock;
2844 zero_user(page, offset, length);
2845 mark_buffer_dirty(bh);
2846 err = 0;
2848 unlock:
2849 unlock_page(page);
2850 page_cache_release(page);
2851 out:
2852 return err;
2856 * The generic ->writepage function for buffer-backed address_spaces
2858 int block_write_full_page(struct page *page, get_block_t *get_block,
2859 struct writeback_control *wbc)
2861 struct inode * const inode = page->mapping->host;
2862 loff_t i_size = i_size_read(inode);
2863 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2864 unsigned offset;
2866 /* Is the page fully inside i_size? */
2867 if (page->index < end_index)
2868 return __block_write_full_page(inode, page, get_block, wbc);
2870 /* Is the page fully outside i_size? (truncate in progress) */
2871 offset = i_size & (PAGE_CACHE_SIZE-1);
2872 if (page->index >= end_index+1 || !offset) {
2874 * The page may have dirty, unmapped buffers. For example,
2875 * they may have been added in ext3_writepage(). Make them
2876 * freeable here, so the page does not leak.
2878 do_invalidatepage(page, 0);
2879 unlock_page(page);
2880 return 0; /* don't care */
2884 * The page straddles i_size. It must be zeroed out on each and every
2885 * writepage invokation because it may be mmapped. "A file is mapped
2886 * in multiples of the page size. For a file that is not a multiple of
2887 * the page size, the remaining memory is zeroed when mapped, and
2888 * writes to that region are not written out to the file."
2890 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2891 return __block_write_full_page(inode, page, get_block, wbc);
2894 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2895 get_block_t *get_block)
2897 struct buffer_head tmp;
2898 struct inode *inode = mapping->host;
2899 tmp.b_state = 0;
2900 tmp.b_blocknr = 0;
2901 tmp.b_size = 1 << inode->i_blkbits;
2902 get_block(inode, block, &tmp, 0);
2903 return tmp.b_blocknr;
2906 static void end_bio_bh_io_sync(struct bio *bio, int err)
2908 struct buffer_head *bh = bio->bi_private;
2910 if (err == -EOPNOTSUPP) {
2911 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2912 set_bit(BH_Eopnotsupp, &bh->b_state);
2915 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2916 bio_put(bio);
2919 int submit_bh(int rw, struct buffer_head * bh)
2921 struct bio *bio;
2922 int ret = 0;
2924 BUG_ON(!buffer_locked(bh));
2925 BUG_ON(!buffer_mapped(bh));
2926 BUG_ON(!bh->b_end_io);
2929 * Mask in barrier bit for a write (could be either a WRITE or a
2930 * WRITE_SYNC
2932 if (buffer_ordered(bh) && (rw & WRITE))
2933 rw |= WRITE_BARRIER;
2936 * Only clear out a write error when rewriting
2938 if (test_set_buffer_req(bh) && (rw & WRITE))
2939 clear_buffer_write_io_error(bh);
2942 * from here on down, it's all bio -- do the initial mapping,
2943 * submit_bio -> generic_make_request may further map this bio around
2945 bio = bio_alloc(GFP_NOIO, 1);
2947 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2948 bio->bi_bdev = bh->b_bdev;
2949 bio->bi_io_vec[0].bv_page = bh->b_page;
2950 bio->bi_io_vec[0].bv_len = bh->b_size;
2951 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2953 bio->bi_vcnt = 1;
2954 bio->bi_idx = 0;
2955 bio->bi_size = bh->b_size;
2957 bio->bi_end_io = end_bio_bh_io_sync;
2958 bio->bi_private = bh;
2960 bio_get(bio);
2961 submit_bio(rw, bio);
2963 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2964 ret = -EOPNOTSUPP;
2966 bio_put(bio);
2967 return ret;
2971 * ll_rw_block: low-level access to block devices (DEPRECATED)
2972 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2973 * @nr: number of &struct buffer_heads in the array
2974 * @bhs: array of pointers to &struct buffer_head
2976 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2977 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2978 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2979 * are sent to disk. The fourth %READA option is described in the documentation
2980 * for generic_make_request() which ll_rw_block() calls.
2982 * This function drops any buffer that it cannot get a lock on (with the
2983 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2984 * clean when doing a write request, and any buffer that appears to be
2985 * up-to-date when doing read request. Further it marks as clean buffers that
2986 * are processed for writing (the buffer cache won't assume that they are
2987 * actually clean until the buffer gets unlocked).
2989 * ll_rw_block sets b_end_io to simple completion handler that marks
2990 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2991 * any waiters.
2993 * All of the buffers must be for the same device, and must also be a
2994 * multiple of the current approved size for the device.
2996 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2998 int i;
3000 for (i = 0; i < nr; i++) {
3001 struct buffer_head *bh = bhs[i];
3003 if (rw == SWRITE || rw == SWRITE_SYNC)
3004 lock_buffer(bh);
3005 else if (!trylock_buffer(bh))
3006 continue;
3008 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
3009 if (test_clear_buffer_dirty(bh)) {
3010 bh->b_end_io = end_buffer_write_sync;
3011 get_bh(bh);
3012 if (rw == SWRITE_SYNC)
3013 submit_bh(WRITE_SYNC, bh);
3014 else
3015 submit_bh(WRITE, bh);
3016 continue;
3018 } else {
3019 if (!buffer_uptodate(bh)) {
3020 bh->b_end_io = end_buffer_read_sync;
3021 get_bh(bh);
3022 submit_bh(rw, bh);
3023 continue;
3026 unlock_buffer(bh);
3031 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3032 * and then start new I/O and then wait upon it. The caller must have a ref on
3033 * the buffer_head.
3035 int sync_dirty_buffer(struct buffer_head *bh)
3037 int ret = 0;
3039 WARN_ON(atomic_read(&bh->b_count) < 1);
3040 lock_buffer(bh);
3041 if (test_clear_buffer_dirty(bh)) {
3042 get_bh(bh);
3043 bh->b_end_io = end_buffer_write_sync;
3044 ret = submit_bh(WRITE_SYNC, bh);
3045 wait_on_buffer(bh);
3046 if (buffer_eopnotsupp(bh)) {
3047 clear_buffer_eopnotsupp(bh);
3048 ret = -EOPNOTSUPP;
3050 if (!ret && !buffer_uptodate(bh))
3051 ret = -EIO;
3052 } else {
3053 unlock_buffer(bh);
3055 return ret;
3059 * try_to_free_buffers() checks if all the buffers on this particular page
3060 * are unused, and releases them if so.
3062 * Exclusion against try_to_free_buffers may be obtained by either
3063 * locking the page or by holding its mapping's private_lock.
3065 * If the page is dirty but all the buffers are clean then we need to
3066 * be sure to mark the page clean as well. This is because the page
3067 * may be against a block device, and a later reattachment of buffers
3068 * to a dirty page will set *all* buffers dirty. Which would corrupt
3069 * filesystem data on the same device.
3071 * The same applies to regular filesystem pages: if all the buffers are
3072 * clean then we set the page clean and proceed. To do that, we require
3073 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3074 * private_lock.
3076 * try_to_free_buffers() is non-blocking.
3078 static inline int buffer_busy(struct buffer_head *bh)
3080 return atomic_read(&bh->b_count) |
3081 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3084 static int
3085 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3087 struct buffer_head *head = page_buffers(page);
3088 struct buffer_head *bh;
3090 bh = head;
3091 do {
3092 if (buffer_write_io_error(bh) && page->mapping)
3093 set_bit(AS_EIO, &page->mapping->flags);
3094 if (buffer_busy(bh))
3095 goto failed;
3096 bh = bh->b_this_page;
3097 } while (bh != head);
3099 do {
3100 struct buffer_head *next = bh->b_this_page;
3102 if (bh->b_assoc_map)
3103 __remove_assoc_queue(bh);
3104 bh = next;
3105 } while (bh != head);
3106 *buffers_to_free = head;
3107 __clear_page_buffers(page);
3108 return 1;
3109 failed:
3110 return 0;
3113 int try_to_free_buffers(struct page *page)
3115 struct address_space * const mapping = page->mapping;
3116 struct buffer_head *buffers_to_free = NULL;
3117 int ret = 0;
3119 BUG_ON(!PageLocked(page));
3120 if (PageWriteback(page))
3121 return 0;
3123 if (mapping == NULL) { /* can this still happen? */
3124 ret = drop_buffers(page, &buffers_to_free);
3125 goto out;
3128 spin_lock(&mapping->private_lock);
3129 ret = drop_buffers(page, &buffers_to_free);
3132 * If the filesystem writes its buffers by hand (eg ext3)
3133 * then we can have clean buffers against a dirty page. We
3134 * clean the page here; otherwise the VM will never notice
3135 * that the filesystem did any IO at all.
3137 * Also, during truncate, discard_buffer will have marked all
3138 * the page's buffers clean. We discover that here and clean
3139 * the page also.
3141 * private_lock must be held over this entire operation in order
3142 * to synchronise against __set_page_dirty_buffers and prevent the
3143 * dirty bit from being lost.
3145 if (ret)
3146 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3147 spin_unlock(&mapping->private_lock);
3148 out:
3149 if (buffers_to_free) {
3150 struct buffer_head *bh = buffers_to_free;
3152 do {
3153 struct buffer_head *next = bh->b_this_page;
3154 free_buffer_head(bh);
3155 bh = next;
3156 } while (bh != buffers_to_free);
3158 return ret;
3160 EXPORT_SYMBOL(try_to_free_buffers);
3162 void block_sync_page(struct page *page)
3164 struct address_space *mapping;
3166 smp_mb();
3167 mapping = page_mapping(page);
3168 if (mapping)
3169 blk_run_backing_dev(mapping->backing_dev_info, page);
3173 * There are no bdflush tunables left. But distributions are
3174 * still running obsolete flush daemons, so we terminate them here.
3176 * Use of bdflush() is deprecated and will be removed in a future kernel.
3177 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3179 asmlinkage long sys_bdflush(int func, long data)
3181 static int msg_count;
3183 if (!capable(CAP_SYS_ADMIN))
3184 return -EPERM;
3186 if (msg_count < 5) {
3187 msg_count++;
3188 printk(KERN_INFO
3189 "warning: process `%s' used the obsolete bdflush"
3190 " system call\n", current->comm);
3191 printk(KERN_INFO "Fix your initscripts?\n");
3194 if (func == 1)
3195 do_exit(0);
3196 return 0;
3200 * Buffer-head allocation
3202 static struct kmem_cache *bh_cachep;
3205 * Once the number of bh's in the machine exceeds this level, we start
3206 * stripping them in writeback.
3208 static int max_buffer_heads;
3210 int buffer_heads_over_limit;
3212 struct bh_accounting {
3213 int nr; /* Number of live bh's */
3214 int ratelimit; /* Limit cacheline bouncing */
3217 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3219 static void recalc_bh_state(void)
3221 int i;
3222 int tot = 0;
3224 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3225 return;
3226 __get_cpu_var(bh_accounting).ratelimit = 0;
3227 for_each_online_cpu(i)
3228 tot += per_cpu(bh_accounting, i).nr;
3229 buffer_heads_over_limit = (tot > max_buffer_heads);
3232 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3234 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3235 if (ret) {
3236 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3237 get_cpu_var(bh_accounting).nr++;
3238 recalc_bh_state();
3239 put_cpu_var(bh_accounting);
3241 return ret;
3243 EXPORT_SYMBOL(alloc_buffer_head);
3245 void free_buffer_head(struct buffer_head *bh)
3247 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3248 kmem_cache_free(bh_cachep, bh);
3249 get_cpu_var(bh_accounting).nr--;
3250 recalc_bh_state();
3251 put_cpu_var(bh_accounting);
3253 EXPORT_SYMBOL(free_buffer_head);
3255 static void buffer_exit_cpu(int cpu)
3257 int i;
3258 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3260 for (i = 0; i < BH_LRU_SIZE; i++) {
3261 brelse(b->bhs[i]);
3262 b->bhs[i] = NULL;
3264 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3265 per_cpu(bh_accounting, cpu).nr = 0;
3266 put_cpu_var(bh_accounting);
3269 static int buffer_cpu_notify(struct notifier_block *self,
3270 unsigned long action, void *hcpu)
3272 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3273 buffer_exit_cpu((unsigned long)hcpu);
3274 return NOTIFY_OK;
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	/* Still not uptodate under the lock: leave it locked for the caller. */
	if (!buffer_uptodate(bh))
		return 0;

	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3297 * bh_submit_read - Submit a locked buffer for reading
3298 * @bh: struct buffer_head
3300 * Returns zero on success and -EIO on error.
3302 int bh_submit_read(struct buffer_head *bh)
3304 BUG_ON(!buffer_locked(bh));
3306 if (buffer_uptodate(bh)) {
3307 unlock_buffer(bh);
3308 return 0;
3311 get_bh(bh);
3312 bh->b_end_io = end_buffer_read_sync;
3313 submit_bh(READ, bh);
3314 wait_on_buffer(bh);
3315 if (buffer_uptodate(bh))
3316 return 0;
3317 return -EIO;
3319 EXPORT_SYMBOL(bh_submit_read);
3321 static void
3322 init_buffer_head(void *data)
3324 struct buffer_head *bh = data;
3326 memset(bh, 0, sizeof(*bh));
3327 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3330 void __init buffer_init(void)
3332 int nrpages;
3334 bh_cachep = kmem_cache_create("buffer_head",
3335 sizeof(struct buffer_head), 0,
3336 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3337 SLAB_MEM_SPREAD),
3338 init_buffer_head);
3341 * Limit the bh occupancy to 10% of ZONE_NORMAL
3343 nrpages = (nr_free_buffer_pages() * 10) / 100;
3344 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3345 hotcpu_notifier(buffer_cpu_notify, 0);
3348 EXPORT_SYMBOL(__bforget);
3349 EXPORT_SYMBOL(__brelse);
3350 EXPORT_SYMBOL(__wait_on_buffer);
3351 EXPORT_SYMBOL(block_commit_write);
3352 EXPORT_SYMBOL(block_prepare_write);
3353 EXPORT_SYMBOL(block_page_mkwrite);
3354 EXPORT_SYMBOL(block_read_full_page);
3355 EXPORT_SYMBOL(block_sync_page);
3356 EXPORT_SYMBOL(block_truncate_page);
3357 EXPORT_SYMBOL(block_write_full_page);
3358 EXPORT_SYMBOL(cont_write_begin);
3359 EXPORT_SYMBOL(end_buffer_read_sync);
3360 EXPORT_SYMBOL(end_buffer_write_sync);
3361 EXPORT_SYMBOL(file_fsync);
3362 EXPORT_SYMBOL(fsync_bdev);
3363 EXPORT_SYMBOL(generic_block_bmap);
3364 EXPORT_SYMBOL(generic_cont_expand_simple);
3365 EXPORT_SYMBOL(init_buffer);
3366 EXPORT_SYMBOL(invalidate_bdev);
3367 EXPORT_SYMBOL(ll_rw_block);
3368 EXPORT_SYMBOL(mark_buffer_dirty);
3369 EXPORT_SYMBOL(submit_bh);
3370 EXPORT_SYMBOL(sync_dirty_buffer);
3371 EXPORT_SYMBOL(unlock_buffer);