MOXA linux-2.6.x / linux-2.6.9-uc0 from sdlinux-moxaart.tgz
[linux-2.6.9-moxart.git] / fs / buffer.c
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
7 /*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/config.h>
22 #include <linux/kernel.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/bio.h>
38 #include <linux/notifier.h>
39 #include <linux/cpu.h>
40 #include <asm/bitops.h>
42 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
43 static void invalidate_bh_lrus(void);
45 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
47 struct bh_wait_queue {
48 struct buffer_head *bh;
49 wait_queue_t wait;
52 #define __DEFINE_BH_WAIT(name, b, f) \
53 struct bh_wait_queue name = { \
54 .bh = b, \
55 .wait = { \
56 .task = current, \
57 .flags = f, \
58 .func = bh_wake_function, \
59 .task_list = \
60 LIST_HEAD_INIT(name.wait.task_list),\
61 }, \
63 #define DEFINE_BH_WAIT(name, bh) __DEFINE_BH_WAIT(name, bh, 0)
64 #define DEFINE_BH_WAIT_EXCLUSIVE(name, bh) \
65 __DEFINE_BH_WAIT(name, bh, WQ_FLAG_EXCLUSIVE)
68 * Hashed waitqueue_head's for wait_on_buffer()
70 #define BH_WAIT_TABLE_ORDER 7
71 static struct bh_wait_queue_head {
72 wait_queue_head_t wqh;
73 } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
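/*
 * Editorial sketch (not part of buffer.c): a standalone userspace demo of the
 * idea behind the hashed wait-queue table above.  Instead of embedding a wait
 * queue in every buffer_head, the pointer value of the bh is hashed into one
 * of 1 << ORDER shared slots.  The multiplicative hash below is only an
 * illustration in the spirit of hash_ptr(); the kernel's real helper lives in
 * <linux/hash.h>, and all names here are invented for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_WAIT_TABLE_ORDER	7
#define DEMO_WAIT_TABLE_SIZE	(1U << DEMO_WAIT_TABLE_ORDER)

/* Fold a pointer down to a small table index. */
static unsigned int demo_hash_ptr(const void *ptr, unsigned int bits)
{
	uint64_t val = (uintptr_t)ptr;

	/* 64-bit golden-ratio multiplier (the constant modern hash_64() uses) */
	val *= 0x61C8864680B583EBULL;
	return (unsigned int)(val >> (64 - bits));
}

int main(void)
{
	int dummy[4];
	unsigned int i;

	for (i = 0; i < 4; i++)
		printf("object %p -> wait slot %u of %u\n",
		       (void *)&dummy[i],
		       demo_hash_ptr(&dummy[i], DEMO_WAIT_TABLE_ORDER),
		       DEMO_WAIT_TABLE_SIZE);
	return 0;
}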
75 inline void
76 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
78 bh->b_end_io = handler;
79 bh->b_private = private;
83 * Return the address of the waitqueue_head to be used for this
84 * buffer_head
86 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
88 return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
90 EXPORT_SYMBOL(bh_waitq_head);
92 void wake_up_buffer(struct buffer_head *bh)
94 wait_queue_head_t *wq = bh_waitq_head(bh);
96 smp_mb();
97 if (waitqueue_active(wq))
98 __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, bh);
100 EXPORT_SYMBOL(wake_up_buffer);
102 static int bh_wake_function(wait_queue_t *wait, unsigned mode,
103 int sync, void *key)
105 struct buffer_head *bh = key;
106 struct bh_wait_queue *wq;
108 wq = container_of(wait, struct bh_wait_queue, wait);
109 if (wq->bh != bh || buffer_locked(bh))
110 return 0;
111 else
112 return autoremove_wake_function(wait, mode, sync, key);
115 static void sync_buffer(struct buffer_head *bh)
117 struct block_device *bd;
119 smp_mb();
120 bd = bh->b_bdev;
121 if (bd)
122 blk_run_address_space(bd->bd_inode->i_mapping);
125 void fastcall __lock_buffer(struct buffer_head *bh)
127 wait_queue_head_t *wqh = bh_waitq_head(bh);
128 DEFINE_BH_WAIT_EXCLUSIVE(wait, bh);
130 do {
131 prepare_to_wait_exclusive(wqh, &wait.wait,
132 TASK_UNINTERRUPTIBLE);
133 if (buffer_locked(bh)) {
134 sync_buffer(bh);
135 io_schedule();
137 } while (test_set_buffer_locked(bh));
138 finish_wait(wqh, &wait.wait);
140 EXPORT_SYMBOL(__lock_buffer);
142 void fastcall unlock_buffer(struct buffer_head *bh)
144 clear_buffer_locked(bh);
145 smp_mb__after_clear_bit();
146 wake_up_buffer(bh);
150 * Block until a buffer comes unlocked. This doesn't stop it
151 * from becoming locked again - you have to lock it yourself
152 * if you want to preserve its state.
154 void __wait_on_buffer(struct buffer_head * bh)
156 wait_queue_head_t *wqh = bh_waitq_head(bh);
157 DEFINE_BH_WAIT(wait, bh);
159 do {
160 prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
161 if (buffer_locked(bh)) {
162 sync_buffer(bh);
163 io_schedule();
165 } while (buffer_locked(bh));
166 finish_wait(wqh, &wait.wait);
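/*
 * Editorial sketch (not part of buffer.c): the wait loops above follow the
 * classic "prepare to wait, re-test the condition, sleep, repeat" pattern so
 * that a wakeup racing with the unlock is never lost.  A rough userspace
 * analogue with pthreads is shown below; the struct and function names are
 * invented for the example and the fields need the usual pthread initializers.
 */
#include <pthread.h>
#include <stdbool.h>

struct demo_buffer {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	bool locked;		/* analogue of the BH_Lock bit */
};

/* Analogue of __wait_on_buffer(): block until the buffer comes unlocked. */
void demo_wait_on_buffer(struct demo_buffer *b)
{
	pthread_mutex_lock(&b->lock);
	while (b->locked)			/* always re-test after waking */
		pthread_cond_wait(&b->wait, &b->lock);
	pthread_mutex_unlock(&b->lock);
}

/* Analogue of unlock_buffer(): clear the bit, then wake any waiters. */
void demo_unlock_buffer(struct demo_buffer *b)
{
	pthread_mutex_lock(&b->lock);
	b->locked = false;
	pthread_cond_broadcast(&b->wait);
	pthread_mutex_unlock(&b->lock);
}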
169 static void
170 __set_page_buffers(struct page *page, struct buffer_head *head)
172 page_cache_get(page);
173 SetPagePrivate(page);
174 page->private = (unsigned long)head;
177 static void
178 __clear_page_buffers(struct page *page)
180 ClearPagePrivate(page);
181 page->private = 0;
182 page_cache_release(page);
185 static void buffer_io_error(struct buffer_head *bh)
187 char b[BDEVNAME_SIZE];
189 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
190 bdevname(bh->b_bdev, b),
191 (unsigned long long)bh->b_blocknr);
195 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
196 * unlock the buffer. This is what ll_rw_block uses too.
198 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
200 if (uptodate) {
201 set_buffer_uptodate(bh);
202 } else {
203 /* This happens, due to failed READA attempts. */
204 clear_buffer_uptodate(bh);
206 unlock_buffer(bh);
207 put_bh(bh);
210 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
212 char b[BDEVNAME_SIZE];
214 if (uptodate) {
215 set_buffer_uptodate(bh);
216 } else {
217 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
218 buffer_io_error(bh);
219 printk(KERN_WARNING "lost page write due to "
220 "I/O error on %s\n",
221 bdevname(bh->b_bdev, b));
223 set_buffer_write_io_error(bh);
224 clear_buffer_uptodate(bh);
226 unlock_buffer(bh);
227 put_bh(bh);
231 * Write out and wait upon all the dirty data associated with a block
232 * device via its mapping. Does not take the superblock lock.
234 int sync_blockdev(struct block_device *bdev)
236 int ret = 0;
238 if (bdev) {
239 int err;
241 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
242 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
243 if (!ret)
244 ret = err;
246 return ret;
248 EXPORT_SYMBOL(sync_blockdev);
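/*
 * Editorial sketch (not part of buffer.c): sync_blockdev() above starts
 * writeback and then waits for it, keeping whichever step failed first.
 * The "first error wins" idiom is easy to get backwards, so here it is in
 * isolation; the two step functions are invented stand-ins for
 * filemap_fdatawrite()/filemap_fdatawait().
 */
#include <stdio.h>

static int demo_start_writeback(void)	{ return 0; }	/* pretend success */
static int demo_wait_on_writeback(void)	{ return -5; }	/* pretend -EIO    */

static int demo_sync(void)
{
	int ret = demo_start_writeback();
	int err = demo_wait_on_writeback();

	if (!ret)		/* only take err if the first step succeeded */
		ret = err;
	return ret;
}

int main(void)
{
	printf("demo_sync() = %d\n", demo_sync());	/* prints -5 */
	return 0;
}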
251 * Write out and wait upon all dirty data associated with this
252 * superblock. Filesystem data as well as the underlying block
253 * device. Takes the superblock lock.
255 int fsync_super(struct super_block *sb)
257 sync_inodes_sb(sb, 0);
258 DQUOT_SYNC(sb);
259 lock_super(sb);
260 if (sb->s_dirt && sb->s_op->write_super)
261 sb->s_op->write_super(sb);
262 unlock_super(sb);
263 if (sb->s_op->sync_fs)
264 sb->s_op->sync_fs(sb, 1);
265 sync_blockdev(sb->s_bdev);
266 sync_inodes_sb(sb, 1);
268 return sync_blockdev(sb->s_bdev);
272 * Write out and wait upon all dirty data associated with this
273 * device. Filesystem data as well as the underlying block
274 * device. Takes the superblock lock.
276 int fsync_bdev(struct block_device *bdev)
278 struct super_block *sb = get_super(bdev);
279 if (sb) {
280 int res = fsync_super(sb);
281 drop_super(sb);
282 return res;
284 return sync_blockdev(bdev);
288 * freeze_bdev -- lock a filesystem and force it into a consistent state
289 * @bdev: blockdevice to lock
291 * This takes the block device bd_mount_sem to make sure no new mounts
292 * happen on bdev until thaw_bdev() is called.
293 * If a superblock is found on this device, we take the s_umount semaphore
294 * on it to make sure nobody unmounts until the snapshot creation is done.
296 struct super_block *freeze_bdev(struct block_device *bdev)
298 struct super_block *sb;
300 down(&bdev->bd_mount_sem);
301 sb = get_super(bdev);
302 if (sb && !(sb->s_flags & MS_RDONLY)) {
303 sb->s_frozen = SB_FREEZE_WRITE;
304 wmb();
306 sync_inodes_sb(sb, 0);
307 DQUOT_SYNC(sb);
309 lock_super(sb);
310 if (sb->s_dirt && sb->s_op->write_super)
311 sb->s_op->write_super(sb);
312 unlock_super(sb);
314 if (sb->s_op->sync_fs)
315 sb->s_op->sync_fs(sb, 1);
317 sync_blockdev(sb->s_bdev);
318 sync_inodes_sb(sb, 1);
320 sb->s_frozen = SB_FREEZE_TRANS;
321 wmb();
323 sync_blockdev(sb->s_bdev);
325 if (sb->s_op->write_super_lockfs)
326 sb->s_op->write_super_lockfs(sb);
329 sync_blockdev(bdev);
330 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
332 EXPORT_SYMBOL(freeze_bdev);
335 * thaw_bdev -- unlock filesystem
336 * @bdev: blockdevice to unlock
337 * @sb: associated superblock
339 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
341 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
343 if (sb) {
344 BUG_ON(sb->s_bdev != bdev);
346 if (sb->s_op->unlockfs)
347 sb->s_op->unlockfs(sb);
348 sb->s_frozen = SB_UNFROZEN;
349 wmb();
350 wake_up(&sb->s_wait_unfrozen);
351 drop_super(sb);
354 up(&bdev->bd_mount_sem);
356 EXPORT_SYMBOL(thaw_bdev);
359 * sync everything. Start out by waking pdflush, because that writes back
360 * all queues in parallel.
362 static void do_sync(unsigned long wait)
364 wakeup_bdflush(0);
365 sync_inodes(0); /* All mappings, inodes and their blockdevs */
366 DQUOT_SYNC(NULL);
367 sync_supers(); /* Write the superblocks */
368 sync_filesystems(0); /* Start syncing the filesystems */
369 sync_filesystems(wait); /* Waitingly sync the filesystems */
370 sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
371 if (!wait)
372 printk("Emergency Sync complete\n");
373 if (unlikely(laptop_mode))
374 laptop_sync_completion();
377 asmlinkage long sys_sync(void)
379 do_sync(1);
380 return 0;
383 void emergency_sync(void)
385 pdflush_operation(do_sync, 0);
389 * Generic function to fsync a file.
391 * filp may be NULL if called via the msync of a vma.
394 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
396 struct inode * inode = dentry->d_inode;
397 struct super_block * sb;
398 int ret;
400 /* sync the inode to buffers */
401 write_inode_now(inode, 0);
403 /* sync the superblock to buffers */
404 sb = inode->i_sb;
405 lock_super(sb);
406 if (sb->s_op->write_super)
407 sb->s_op->write_super(sb);
408 unlock_super(sb);
410 /* .. finally sync the buffers to disk */
411 ret = sync_blockdev(sb->s_bdev);
412 return ret;
415 asmlinkage long sys_fsync(unsigned int fd)
417 struct file * file;
418 struct address_space *mapping;
419 int ret, err;
421 ret = -EBADF;
422 file = fget(fd);
423 if (!file)
424 goto out;
426 mapping = file->f_mapping;
428 ret = -EINVAL;
429 if (!file->f_op || !file->f_op->fsync) {
430 /* Why? We can still call filemap_fdatawrite */
431 goto out_putf;
434 /* We need to protect against concurrent writers.. */
435 down(&mapping->host->i_sem);
436 current->flags |= PF_SYNCWRITE;
437 ret = filemap_fdatawrite(mapping);
438 err = file->f_op->fsync(file, file->f_dentry, 0);
439 if (!ret)
440 ret = err;
441 err = filemap_fdatawait(mapping);
442 if (!ret)
443 ret = err;
444 current->flags &= ~PF_SYNCWRITE;
445 up(&mapping->host->i_sem);
447 out_putf:
448 fput(file);
449 out:
450 return ret;
453 asmlinkage long sys_fdatasync(unsigned int fd)
455 struct file * file;
456 struct address_space *mapping;
457 int ret, err;
459 ret = -EBADF;
460 file = fget(fd);
461 if (!file)
462 goto out;
464 ret = -EINVAL;
465 if (!file->f_op || !file->f_op->fsync)
466 goto out_putf;
468 mapping = file->f_mapping;
470 down(&mapping->host->i_sem);
471 current->flags |= PF_SYNCWRITE;
472 ret = filemap_fdatawrite(mapping);
473 err = file->f_op->fsync(file, file->f_dentry, 1);
474 if (!ret)
475 ret = err;
476 err = filemap_fdatawait(mapping);
477 if (!ret)
478 ret = err;
479 current->flags &= ~PF_SYNCWRITE;
480 up(&mapping->host->i_sem);
482 out_putf:
483 fput(file);
484 out:
485 return ret;
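/*
 * Editorial sketch (not part of buffer.c): how the two syscalls implemented
 * above are reached from userspace.  sys_fsync() backs fsync(2) and
 * sys_fdatasync() backs fdatasync(2); the only difference visible in the
 * kernel code is the datasync argument passed to ->fsync().  Error handling
 * here is intentionally minimal.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("demo.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "hello\n", 6) != 6)
		perror("write");
	if (fdatasync(fd) != 0)		/* data plus the metadata needed to read it back */
		perror("fdatasync");
	if (fsync(fd) != 0)		/* data plus all inode metadata */
		perror("fsync");
	close(fd);
	return 0;
}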
489 * Various filesystems appear to want __find_get_block to be non-blocking.
490 * But it's the page lock which protects the buffers. To get around this,
491 * we get exclusion from try_to_free_buffers with the blockdev mapping's
492 * private_lock.
494 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
495 * may be quite high. This code could TryLock the page, and if that
496 * succeeds, there is no need to take private_lock. (But if
497 * private_lock is contended then so is mapping->tree_lock).
499 static struct buffer_head *
500 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
502 struct inode *bd_inode = bdev->bd_inode;
503 struct address_space *bd_mapping = bd_inode->i_mapping;
504 struct buffer_head *ret = NULL;
505 pgoff_t index;
506 struct buffer_head *bh;
507 struct buffer_head *head;
508 struct page *page;
510 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
511 page = find_get_page(bd_mapping, index);
512 if (!page)
513 goto out;
515 spin_lock(&bd_mapping->private_lock);
516 if (!page_has_buffers(page))
517 goto out_unlock;
518 head = page_buffers(page);
519 bh = head;
520 do {
521 if (bh->b_blocknr == block) {
522 ret = bh;
523 get_bh(bh);
524 goto out_unlock;
526 bh = bh->b_this_page;
527 } while (bh != head);
529 printk("__find_get_block_slow() failed. "
530 "block=%llu, b_blocknr=%llu\n",
531 (unsigned long long)block, (unsigned long long)bh->b_blocknr);
532 printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
533 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
534 out_unlock:
535 spin_unlock(&bd_mapping->private_lock);
536 page_cache_release(page);
537 out:
538 return ret;
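/*
 * Editorial sketch (not part of buffer.c): the index computation used by
 * __find_get_block_slow() above.  With pages of 2^page_shift bytes and
 * blocks of 2^blkbits bytes there are 2^(page_shift - blkbits) blocks per
 * page, so the page index is just the block number shifted down by that
 * difference.  The numbers below assume 4096-byte pages and 1024-byte blocks.
 */
#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;	/* 4096-byte pages  */
	unsigned int blkbits = 10;	/* 1024-byte blocks */
	unsigned long long block = 1234;

	unsigned long long index = block >> (page_shift - blkbits);
	unsigned long long first = index << (page_shift - blkbits);

	printf("block %llu -> page index %llu (covers blocks %llu..%llu)\n",
	       block, index, first,
	       first + (1ULL << (page_shift - blkbits)) - 1);
	return 0;
}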
541 /* If invalidate_buffers() will trash dirty buffers, it means some kind
542 of fs corruption is going on. Trashing dirty data always implies losing
543 information that was supposed to be just stored on the physical layer
544 by the user.
546 Thus invalidate_buffers in general usage is not allowed to trash
547 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
548 be preserved. These buffers are simply skipped.
550 We also skip buffers which are still in use. For example this can
551 happen if a userspace program is reading the block device.
553 NOTE: in the case where the user removed a removable-media disk even though
554 there was still dirty data not synced to disk (due to a bug in the device
555 driver or to an error of the user), by not destroying the dirty buffers we
556 could also generate corruption on the next media inserted; thus a parameter
557 is necessary to handle this case in the safest way possible (trying
558 not to also corrupt the newly inserted disk with the data belonging to
559 the old, now corrupted disk). Also, for the ramdisk the natural thing
560 to do in order to release the ramdisk memory is to destroy dirty buffers.
562 These are two special cases. Normal usage implies that the device driver
563 issues a sync on the device (without waiting for I/O completion) and
564 then an invalidate_buffers call that doesn't trash dirty buffers.
566 For handling cache coherency with the blkdev pagecache, the 'update' case
567 has been introduced. It is needed to re-read from disk any pinned
568 buffer. NOTE: re-reading from disk is destructive, so we can do it only
569 when we assume nobody is changing the buffercache under our I/O and when
570 we think the disk contains more recent information than the buffercache.
571 The update == 1 pass marks the buffers we need to update, the update == 2
572 pass does the actual I/O. */
573 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
575 invalidate_bh_lrus();
577 * FIXME: what about destroy_dirty_buffers?
578 * We really want to use invalidate_inode_pages2() for
579 * that, but not until that's cleaned up.
581 invalidate_inode_pages(bdev->bd_inode->i_mapping);
585 * Kick pdflush then try to free up some ZONE_NORMAL memory.
587 static void free_more_memory(void)
589 struct zone **zones;
590 pg_data_t *pgdat;
592 wakeup_bdflush(1024);
593 yield();
595 for_each_pgdat(pgdat) {
596 zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
597 if (*zones)
598 try_to_free_pages(zones, GFP_NOFS, 0);
603 * I/O completion handler for block_read_full_page() - pages
604 * which come unlocked at the end of I/O.
606 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
608 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
609 unsigned long flags;
610 struct buffer_head *tmp;
611 struct page *page;
612 int page_uptodate = 1;
614 BUG_ON(!buffer_async_read(bh));
616 page = bh->b_page;
617 if (uptodate) {
618 set_buffer_uptodate(bh);
619 } else {
620 clear_buffer_uptodate(bh);
621 buffer_io_error(bh);
622 SetPageError(page);
626 * Be _very_ careful from here on. Bad things can happen if
627 * two buffer heads end IO at almost the same time and both
628 * decide that the page is now completely done.
630 spin_lock_irqsave(&page_uptodate_lock, flags);
631 clear_buffer_async_read(bh);
632 unlock_buffer(bh);
633 tmp = bh;
634 do {
635 if (!buffer_uptodate(tmp))
636 page_uptodate = 0;
637 if (buffer_async_read(tmp)) {
638 BUG_ON(!buffer_locked(tmp));
639 goto still_busy;
641 tmp = tmp->b_this_page;
642 } while (tmp != bh);
643 spin_unlock_irqrestore(&page_uptodate_lock, flags);
646 * If none of the buffers had errors and they are all
647 * uptodate then we can set the page uptodate.
649 if (page_uptodate && !PageError(page))
650 SetPageUptodate(page);
651 unlock_page(page);
652 return;
654 still_busy:
655 spin_unlock_irqrestore(&page_uptodate_lock, flags);
656 return;
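/*
 * Editorial sketch (not part of buffer.c): the completion handler above walks
 * the page's buffers through the circular, singly linked b_this_page ring and
 * only finishes the page when no sibling is still busy.  A minimal standalone
 * version of that ring walk (all names invented for the example):
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_bh {
	struct demo_bh *this_page;	/* next buffer on the same page */
	bool uptodate;
	bool busy;
};

/* Return true only if every buffer on the ring is uptodate and idle. */
static bool demo_page_done(struct demo_bh *bh)
{
	struct demo_bh *tmp = bh;

	do {
		if (!tmp->uptodate || tmp->busy)
			return false;
		tmp = tmp->this_page;
	} while (tmp != bh);
	return true;
}

int main(void)
{
	struct demo_bh a = { 0 }, b = { 0 };

	a.this_page = &b;		/* two buffers sharing one page */
	b.this_page = &a;
	a.uptodate = b.uptodate = true;
	printf("page done: %d\n", demo_page_done(&a));	/* prints 1 */
	return 0;
}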
660 * Completion handler for block_write_full_page() - pages which are unlocked
661 * during I/O, and which have PageWriteback cleared upon I/O completion.
663 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
665 char b[BDEVNAME_SIZE];
666 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
667 unsigned long flags;
668 struct buffer_head *tmp;
669 struct page *page;
671 BUG_ON(!buffer_async_write(bh));
673 page = bh->b_page;
674 if (uptodate) {
675 set_buffer_uptodate(bh);
676 } else {
677 if (printk_ratelimit()) {
678 buffer_io_error(bh);
679 printk(KERN_WARNING "lost page write due to "
680 "I/O error on %s\n",
681 bdevname(bh->b_bdev, b));
683 set_bit(AS_EIO, &page->mapping->flags);
684 clear_buffer_uptodate(bh);
685 SetPageError(page);
688 spin_lock_irqsave(&page_uptodate_lock, flags);
689 clear_buffer_async_write(bh);
690 unlock_buffer(bh);
691 tmp = bh->b_this_page;
692 while (tmp != bh) {
693 if (buffer_async_write(tmp)) {
694 BUG_ON(!buffer_locked(tmp));
695 goto still_busy;
697 tmp = tmp->b_this_page;
699 spin_unlock_irqrestore(&page_uptodate_lock, flags);
700 end_page_writeback(page);
701 return;
703 still_busy:
704 spin_unlock_irqrestore(&page_uptodate_lock, flags);
705 return;
709 * If a page's buffers are under async read-in (end_buffer_async_read
710 * completion) then there is a possibility that another thread of
711 * control could lock one of the buffers after it has completed
712 * but while some of the other buffers have not completed. This
713 * locked buffer would confuse end_buffer_async_read() into not unlocking
714 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
715 * that this buffer is not under async I/O.
717 * The page comes unlocked when it has no locked buffer_async buffers
718 * left.
720 * PageLocked prevents anyone starting new async I/O reads any of
721 * the buffers.
723 * PageWriteback is used to prevent simultaneous writeout of the same
724 * page.
726 * PageLocked prevents anyone from starting writeback of a page which is
727 * under read I/O (PageWriteback is only ever set against a locked page).
729 static void mark_buffer_async_read(struct buffer_head *bh)
731 bh->b_end_io = end_buffer_async_read;
732 set_buffer_async_read(bh);
735 void mark_buffer_async_write(struct buffer_head *bh)
737 bh->b_end_io = end_buffer_async_write;
738 set_buffer_async_write(bh);
740 EXPORT_SYMBOL(mark_buffer_async_write);
744 * fs/buffer.c contains helper functions for buffer-backed address space's
745 * fsync functions. A common requirement for buffer-based filesystems is
746 * that certain data from the backing blockdev needs to be written out for
747 * a successful fsync(). For example, ext2 indirect blocks need to be
748 * written back and waited upon before fsync() returns.
750 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
751 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
752 * management of a list of dependent buffers at ->i_mapping->private_list.
754 * Locking is a little subtle: try_to_free_buffers() will remove buffers
755 * from their controlling inode's queue when they are being freed. But
756 * try_to_free_buffers() will be operating against the *blockdev* mapping
757 * at the time, not against the S_ISREG file which depends on those buffers.
758 * So the locking for private_list is via the private_lock in the address_space
759 * which backs the buffers. Which is different from the address_space
760 * against which the buffers are listed. So for a particular address_space,
761 * mapping->private_lock does *not* protect mapping->private_list! In fact,
762 * mapping->private_list will always be protected by the backing blockdev's
763 * ->private_lock.
765 * Which introduces a requirement: all buffers on an address_space's
766 * ->private_list must be from the same address_space: the blockdev's.
768 * address_spaces which do not place buffers at ->private_list via these
769 * utility functions are free to use private_lock and private_list for
770 * whatever they want. The only requirement is that list_empty(private_list)
771 * be true at clear_inode() time.
773 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
774 * filesystems should do that. invalidate_inode_buffers() should just go
775 * BUG_ON(!list_empty).
777 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
778 * take an address_space, not an inode. And it should be called
779 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
780 * queued up.
782 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
783 * list if it is already on a list. Because if the buffer is on a list,
784 * it *must* already be on the right one. If not, the filesystem is being
785 * silly. This will save a ton of locking. But first we have to ensure
786 * that buffers are taken *off* the old inode's list when they are freed
787 * (presumably in truncate). That requires careful auditing of all
788 * filesystems (do it inside bforget()). It could also be done by bringing
789 * b_inode back.
793 * The buffer's backing address_space's private_lock must be held
795 static inline void __remove_assoc_queue(struct buffer_head *bh)
797 list_del_init(&bh->b_assoc_buffers);
800 int inode_has_buffers(struct inode *inode)
802 return !list_empty(&inode->i_data.private_list);
806 * osync is designed to support O_SYNC io. It waits synchronously for
807 * all already-submitted IO to complete, but does not queue any new
808 * writes to the disk.
810 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
811 * you dirty the buffers, and then use osync_inode_buffers to wait for
812 * completion. Any other dirty buffers which are not yet queued for
813 * write will not be flushed to disk by the osync.
815 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
817 struct buffer_head *bh;
818 struct list_head *p;
819 int err = 0;
821 spin_lock(lock);
822 repeat:
823 list_for_each_prev(p, list) {
824 bh = BH_ENTRY(p);
825 if (buffer_locked(bh)) {
826 get_bh(bh);
827 spin_unlock(lock);
828 wait_on_buffer(bh);
829 if (!buffer_uptodate(bh))
830 err = -EIO;
831 brelse(bh);
832 spin_lock(lock);
833 goto repeat;
836 spin_unlock(lock);
837 return err;
841 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
842 * buffers
843 * @buffer_mapping - the mapping which backs the buffers' data
844 * @mapping - the mapping which wants those buffers written
846 * Starts I/O against the buffers at mapping->private_list, and waits upon
847 * that I/O.
849 * Basically, this is a convenience function for fsync(). @buffer_mapping is
850 * the blockdev which "owns" the buffers and @mapping is a file or directory
851 * which needs those buffers to be written for a successful fsync().
853 int sync_mapping_buffers(struct address_space *mapping)
855 struct address_space *buffer_mapping = mapping->assoc_mapping;
857 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
858 return 0;
860 return fsync_buffers_list(&buffer_mapping->private_lock,
861 &mapping->private_list);
863 EXPORT_SYMBOL(sync_mapping_buffers);
866 * Called when we've recently written block `bblock', and it is known that
867 * `bblock' was for a buffer_boundary() buffer. This means that the block at
868 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
869 * dirty, schedule it for IO. So that indirects merge nicely with their data.
871 void write_boundary_block(struct block_device *bdev,
872 sector_t bblock, unsigned blocksize)
874 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
875 if (bh) {
876 if (buffer_dirty(bh))
877 ll_rw_block(WRITE, 1, &bh);
878 put_bh(bh);
882 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
884 struct address_space *mapping = inode->i_mapping;
885 struct address_space *buffer_mapping = bh->b_page->mapping;
887 mark_buffer_dirty(bh);
888 if (!mapping->assoc_mapping) {
889 mapping->assoc_mapping = buffer_mapping;
890 } else {
891 if (mapping->assoc_mapping != buffer_mapping)
892 BUG();
894 if (list_empty(&bh->b_assoc_buffers)) {
895 spin_lock(&buffer_mapping->private_lock);
896 list_move_tail(&bh->b_assoc_buffers,
897 &mapping->private_list);
898 spin_unlock(&buffer_mapping->private_lock);
901 EXPORT_SYMBOL(mark_buffer_dirty_inode);
904 * Add a page to the dirty page list.
906 * It is a sad fact of life that this function is called from several places
907 * deeply under spinlocking. It may not sleep.
909 * If the page has buffers, the uptodate buffers are set dirty, to preserve
910 * dirty-state coherency between the page and the buffers. If the page does
911 * not have buffers, then when they are later attached they will all be set
912 * dirty.
914 * The buffers are dirtied before the page is dirtied. There's a small race
915 * window in which a writepage caller may see the page cleanness but not the
916 * buffer dirtiness. That's fine. If this code were to set the page dirty
917 * before the buffers, a concurrent writepage caller could clear the page dirty
918 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
919 * page on the dirty page list.
921 * We use private_lock to lock against try_to_free_buffers while using the
922 * page's buffer list. Also use this to protect against clean buffers being
923 * added to the page after it was set dirty.
925 * FIXME: may need to call ->reservepage here as well. That's rather up to the
926 * address_space though.
928 int __set_page_dirty_buffers(struct page *page)
930 struct address_space * const mapping = page->mapping;
932 spin_lock(&mapping->private_lock);
933 if (page_has_buffers(page)) {
934 struct buffer_head *head = page_buffers(page);
935 struct buffer_head *bh = head;
937 do {
938 set_buffer_dirty(bh);
939 bh = bh->b_this_page;
940 } while (bh != head);
942 spin_unlock(&mapping->private_lock);
944 if (!TestSetPageDirty(page)) {
945 spin_lock_irq(&mapping->tree_lock);
946 if (page->mapping) { /* Race with truncate? */
947 if (!mapping->backing_dev_info->memory_backed)
948 inc_page_state(nr_dirty);
949 radix_tree_tag_set(&mapping->page_tree,
950 page_index(page),
951 PAGECACHE_TAG_DIRTY);
953 spin_unlock_irq(&mapping->tree_lock);
954 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
957 return 0;
959 EXPORT_SYMBOL(__set_page_dirty_buffers);
962 * Write out and wait upon a list of buffers.
964 * We have conflicting pressures: we want to make sure that all
965 * initially dirty buffers get waited on, but that any subsequently
966 * dirtied buffers don't. After all, we don't want fsync to last
967 * forever if somebody is actively writing to the file.
969 * Do this in two main stages: first we copy dirty buffers to a
970 * temporary inode list, queueing the writes as we go. Then we clean
971 * up, waiting for those writes to complete.
973 * During this second stage, any subsequent updates to the file may end
974 * up refiling the buffer on the original inode's dirty list again, so
975 * there is a chance we will end up with a buffer queued for write but
976 * not yet completed on that list. So, as a final cleanup we go through
977 * the osync code to catch these locked, dirty buffers without requeuing
978 * any newly dirty buffers for write.
980 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
982 struct buffer_head *bh;
983 struct list_head tmp;
984 int err = 0, err2;
986 INIT_LIST_HEAD(&tmp);
988 spin_lock(lock);
989 while (!list_empty(list)) {
990 bh = BH_ENTRY(list->next);
991 list_del_init(&bh->b_assoc_buffers);
992 if (buffer_dirty(bh) || buffer_locked(bh)) {
993 list_add(&bh->b_assoc_buffers, &tmp);
994 if (buffer_dirty(bh)) {
995 get_bh(bh);
996 spin_unlock(lock);
998 * Ensure any pending I/O completes so that
999 * ll_rw_block() actually writes the current
1000 * contents - it is a noop if I/O is still in
1001 * flight on potentially older contents.
1003 wait_on_buffer(bh);
1004 ll_rw_block(WRITE, 1, &bh);
1005 brelse(bh);
1006 spin_lock(lock);
1011 while (!list_empty(&tmp)) {
1012 bh = BH_ENTRY(tmp.prev);
1013 __remove_assoc_queue(bh);
1014 get_bh(bh);
1015 spin_unlock(lock);
1016 wait_on_buffer(bh);
1017 if (!buffer_uptodate(bh))
1018 err = -EIO;
1019 brelse(bh);
1020 spin_lock(lock);
1023 spin_unlock(lock);
1024 err2 = osync_buffers_list(lock, list);
1025 if (err)
1026 return err;
1027 else
1028 return err2;
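/*
 * Editorial sketch (not part of buffer.c): fsync_buffers_list() above works
 * in two stages so that buffers dirtied *after* fsync started cannot keep it
 * running forever - the currently dirty buffers are first moved to a private
 * temporary list and their writes are queued, then only that snapshot is
 * waited on.  A condensed array-based version of the same idea, with all
 * names invented and the real locking/refcounting omitted:
 */
#include <stdbool.h>
#include <stdio.h>

struct demo_buf {
	int id;
	bool dirty;
	bool write_queued;
};

static void demo_fsync_list(struct demo_buf *bufs, int n)
{
	struct demo_buf *snapshot[16];
	int count = 0, i;

	/* Stage 1: snapshot and queue the buffers that are dirty right now. */
	for (i = 0; i < n && count < 16; i++) {
		if (bufs[i].dirty) {
			bufs[i].dirty = false;
			bufs[i].write_queued = true;	/* "ll_rw_block(WRITE)" */
			snapshot[count++] = &bufs[i];
		}
	}

	/* Stage 2: wait only on the snapshot; later dirtyings are ignored. */
	for (i = 0; i < count; i++) {
		snapshot[i]->write_queued = false;	/* "wait_on_buffer()" */
		printf("buffer %d written and waited on\n", snapshot[i]->id);
	}
}

int main(void)
{
	struct demo_buf bufs[3] = {
		{ 1, true, false }, { 2, false, false }, { 3, true, false }
	};

	demo_fsync_list(bufs, 3);
	return 0;
}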
1032 * Invalidate any and all dirty buffers on a given inode. We are
1033 * probably unmounting the fs, but that doesn't mean we have already
1034 * done a sync(). Just drop the buffers from the inode list.
1036 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
1037 * assumes that all the buffers are against the blockdev. Not true
1038 * for reiserfs.
1040 void invalidate_inode_buffers(struct inode *inode)
1042 if (inode_has_buffers(inode)) {
1043 struct address_space *mapping = &inode->i_data;
1044 struct list_head *list = &mapping->private_list;
1045 struct address_space *buffer_mapping = mapping->assoc_mapping;
1047 spin_lock(&buffer_mapping->private_lock);
1048 while (!list_empty(list))
1049 __remove_assoc_queue(BH_ENTRY(list->next));
1050 spin_unlock(&buffer_mapping->private_lock);
1055 * Remove any clean buffers from the inode's buffer list. This is called
1056 * when we're trying to free the inode itself. Those buffers can pin it.
1058 * Returns true if all buffers were removed.
1060 int remove_inode_buffers(struct inode *inode)
1062 int ret = 1;
1064 if (inode_has_buffers(inode)) {
1065 struct address_space *mapping = &inode->i_data;
1066 struct list_head *list = &mapping->private_list;
1067 struct address_space *buffer_mapping = mapping->assoc_mapping;
1069 spin_lock(&buffer_mapping->private_lock);
1070 while (!list_empty(list)) {
1071 struct buffer_head *bh = BH_ENTRY(list->next);
1072 if (buffer_dirty(bh)) {
1073 ret = 0;
1074 break;
1076 __remove_assoc_queue(bh);
1078 spin_unlock(&buffer_mapping->private_lock);
1080 return ret;
1084 * Create the appropriate buffers when given a page for the data area and
1085 * the size of each buffer. Use the bh->b_this_page linked list to
1086 * follow the buffers created. Return NULL if unable to create more
1087 * buffers.
1089 * The retry flag is used to differentiate async IO (paging, swapping)
1090 * which may not fail from ordinary buffer allocations.
1092 static struct buffer_head *
1093 create_buffers(struct page * page, unsigned long size, int retry)
1095 struct buffer_head *bh, *head;
1096 long offset;
1098 try_again:
1099 head = NULL;
1100 offset = PAGE_SIZE;
1101 while ((offset -= size) >= 0) {
1102 bh = alloc_buffer_head(GFP_NOFS);
1103 if (!bh)
1104 goto no_grow;
1106 bh->b_bdev = NULL;
1107 bh->b_this_page = head;
1108 bh->b_blocknr = -1;
1109 head = bh;
1111 bh->b_state = 0;
1112 atomic_set(&bh->b_count, 0);
1113 bh->b_size = size;
1115 /* Link the buffer to its page */
1116 set_bh_page(bh, page, offset);
1118 bh->b_end_io = NULL;
1120 return head;
1122 * In case anything failed, we just free everything we got.
1124 no_grow:
1125 if (head) {
1126 do {
1127 bh = head;
1128 head = head->b_this_page;
1129 free_buffer_head(bh);
1130 } while (head);
1134 * Return failure for non-async IO requests. Async IO requests
1135 * are not allowed to fail, so we have to wait until buffer heads
1136 * become available. But we don't want tasks sleeping with
1137 * partially complete buffers, so all were released above.
1139 if (!retry)
1140 return NULL;
1142 /* We're _really_ low on memory. Now we just
1143 * wait for old buffer heads to become free due to
1144 * finishing IO. Since this is an async request and
1145 * the reserve list is empty, we're sure there are
1146 * async buffer heads in use.
1148 free_more_memory();
1149 goto try_again;
1152 static inline void
1153 link_dev_buffers(struct page *page, struct buffer_head *head)
1155 struct buffer_head *bh, *tail;
1157 bh = head;
1158 do {
1159 tail = bh;
1160 bh = bh->b_this_page;
1161 } while (bh);
1162 tail->b_this_page = head;
1163 __set_page_buffers(page, head);
1167 * Initialise the state of a blockdev page's buffers.
1169 static void
1170 init_page_buffers(struct page *page, struct block_device *bdev,
1171 sector_t block, int size)
1173 struct buffer_head *head = page_buffers(page);
1174 struct buffer_head *bh = head;
1175 unsigned int b_state;
1177 b_state = 1 << BH_Mapped;
1178 if (PageUptodate(page))
1179 b_state |= 1 << BH_Uptodate;
1181 do {
1182 if (!(bh->b_state & (1 << BH_Mapped))) {
1183 init_buffer(bh, NULL, NULL);
1184 bh->b_bdev = bdev;
1185 bh->b_blocknr = block;
1186 bh->b_state = b_state;
1188 block++;
1189 bh = bh->b_this_page;
1190 } while (bh != head);
1194 * Create the page-cache page that contains the requested block.
1196 * This is used purely for blockdev mappings.
1198 static struct page *
1199 grow_dev_page(struct block_device *bdev, sector_t block,
1200 pgoff_t index, int size)
1202 struct inode *inode = bdev->bd_inode;
1203 struct page *page;
1204 struct buffer_head *bh;
1206 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1207 if (!page)
1208 return NULL;
1210 if (!PageLocked(page))
1211 BUG();
1213 if (page_has_buffers(page)) {
1214 bh = page_buffers(page);
1215 if (bh->b_size == size)
1216 return page;
1217 if (!try_to_free_buffers(page))
1218 goto failed;
1222 * Allocate some buffers for this page
1224 bh = create_buffers(page, size, 0);
1225 if (!bh)
1226 goto failed;
1229 * Link the page to the buffers and initialise them. Take the
1230 * lock to be atomic wrt __find_get_block(), which does not
1231 * run under the page lock.
1233 spin_lock(&inode->i_mapping->private_lock);
1234 link_dev_buffers(page, bh);
1235 init_page_buffers(page, bdev, block, size);
1236 spin_unlock(&inode->i_mapping->private_lock);
1237 return page;
1239 failed:
1240 BUG();
1241 unlock_page(page);
1242 page_cache_release(page);
1243 return NULL;
1247 * Create buffers for the specified block device block's page. If
1248 * that page was dirty, the buffers are set dirty also.
1250 * Except that's a bug. Attaching dirty buffers to a dirty
1251 * blockdev's page can result in filesystem corruption, because
1252 * some of those buffers may be aliases of filesystem data.
1253 * grow_dev_page() will go BUG() if this happens.
1255 static inline int
1256 grow_buffers(struct block_device *bdev, sector_t block, int size)
1258 struct page *page;
1259 pgoff_t index;
1260 int sizebits;
1262 sizebits = -1;
1263 do {
1264 sizebits++;
1265 } while ((size << sizebits) < PAGE_SIZE);
1267 index = block >> sizebits;
1268 block = index << sizebits;
1270 /* Create a page with the proper size buffers.. */
1271 page = grow_dev_page(bdev, block, index, size);
1272 if (!page)
1273 return 0;
1274 unlock_page(page);
1275 page_cache_release(page);
1276 return 1;
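/*
 * Editorial sketch (not part of buffer.c): the index arithmetic used by
 * grow_buffers() above.  sizebits is the number of blocks per page expressed
 * as a shift, so "block >> sizebits" is the page index and "index << sizebits"
 * rounds the block number down to the first block covered by that page.
 * The demo assumes a 4096-byte PAGE_SIZE and a 1024-byte block size.
 */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long size = 1024;		/* block size in bytes */
	unsigned long long block = 4099;
	int sizebits = -1;

	do {
		sizebits++;
	} while ((size << sizebits) < page_size);

	{
		unsigned long long index = block >> sizebits;
		unsigned long long first = index << sizebits;

		/* prints: sizebits=2, block 4099, page 1024, first block 4096 */
		printf("sizebits=%d: block %llu lives in page %llu, "
		       "whose first block is %llu\n",
		       sizebits, block, index, first);
	}
	return 0;
}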
1279 struct buffer_head *
1280 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1282 /* Size must be a multiple of the hard sector size */
1283 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1284 (size < 512 || size > PAGE_SIZE))) {
1285 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1286 size);
1287 printk(KERN_ERR "hardsect size: %d\n",
1288 bdev_hardsect_size(bdev));
1290 dump_stack();
1291 return NULL;
1294 for (;;) {
1295 struct buffer_head * bh;
1297 bh = __find_get_block(bdev, block, size);
1298 if (bh)
1299 return bh;
1301 if (!grow_buffers(bdev, block, size))
1302 free_more_memory();
1307 * The relationship between dirty buffers and dirty pages:
1309 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1310 * the page is tagged dirty in its radix tree.
1312 * At all times, the dirtiness of the buffers represents the dirtiness of
1313 * subsections of the page. If the page has buffers, the page dirty bit is
1314 * merely a hint about the true dirty state.
1316 * When a page is set dirty in its entirety, all its buffers are marked dirty
1317 * (if the page has buffers).
1319 * When a buffer is marked dirty, its page is dirtied, but the page's other
1320 * buffers are not.
1322 * Also. When blockdev buffers are explicitly read with bread(), they
1323 * individually become uptodate. But their backing page remains not
1324 * uptodate - even if all of its buffers are uptodate. A subsequent
1325 * block_read_full_page() against that page will discover all the uptodate
1326 * buffers, will set the page uptodate and will perform no I/O.
1330 * mark_buffer_dirty - mark a buffer_head as needing writeout
1332 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1333 * backing page dirty, then tag the page as dirty in its address_space's radix
1334 * tree and then attach the address_space's inode to its superblock's dirty
1335 * inode list.
1337 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1338 * mapping->tree_lock and the global inode_lock.
1340 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1342 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1343 __set_page_dirty_nobuffers(bh->b_page);
1347 * Decrement a buffer_head's reference count. If all buffers against a page
1348 * have zero reference count, are clean and unlocked, and if the page is clean
1349 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1350 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1351 * a page but it ends up not being freed, and buffers may later be reattached).
1353 void __brelse(struct buffer_head * buf)
1355 if (atomic_read(&buf->b_count)) {
1356 put_bh(buf);
1357 return;
1359 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1360 WARN_ON(1);
1364 * bforget() is like brelse(), except it discards any
1365 * potentially dirty data.
1367 void __bforget(struct buffer_head *bh)
1369 clear_buffer_dirty(bh);
1370 if (!list_empty(&bh->b_assoc_buffers)) {
1371 struct address_space *buffer_mapping = bh->b_page->mapping;
1373 spin_lock(&buffer_mapping->private_lock);
1374 list_del_init(&bh->b_assoc_buffers);
1375 spin_unlock(&buffer_mapping->private_lock);
1377 __brelse(bh);
1380 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1382 lock_buffer(bh);
1383 if (buffer_uptodate(bh)) {
1384 unlock_buffer(bh);
1385 return bh;
1386 } else {
1387 get_bh(bh);
1388 bh->b_end_io = end_buffer_read_sync;
1389 submit_bh(READ, bh);
1390 wait_on_buffer(bh);
1391 if (buffer_uptodate(bh))
1392 return bh;
1394 brelse(bh);
1395 return NULL;
1399 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1400 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1401 * refcount elevated by one when they're in an LRU. A buffer can only appear
1402 * once in a particular CPU's LRU. A single buffer can be present in multiple
1403 * CPU's LRUs at the same time.
1405 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1406 * sb_find_get_block().
1408 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1409 * a local interrupt disable for that.
1412 #define BH_LRU_SIZE 8
1414 struct bh_lru {
1415 struct buffer_head *bhs[BH_LRU_SIZE];
1418 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1420 #ifdef CONFIG_SMP
1421 #define bh_lru_lock() local_irq_disable()
1422 #define bh_lru_unlock() local_irq_enable()
1423 #else
1424 #define bh_lru_lock() preempt_disable()
1425 #define bh_lru_unlock() preempt_enable()
1426 #endif
1428 static inline void check_irqs_on(void)
1430 #ifdef irqs_disabled
1431 BUG_ON(irqs_disabled());
1432 #endif
1436 * The LRU management algorithm is dopey-but-simple. Sorry.
1438 static void bh_lru_install(struct buffer_head *bh)
1440 struct buffer_head *evictee = NULL;
1441 struct bh_lru *lru;
1443 check_irqs_on();
1444 bh_lru_lock();
1445 lru = &__get_cpu_var(bh_lrus);
1446 if (lru->bhs[0] != bh) {
1447 struct buffer_head *bhs[BH_LRU_SIZE];
1448 int in;
1449 int out = 0;
1451 get_bh(bh);
1452 bhs[out++] = bh;
1453 for (in = 0; in < BH_LRU_SIZE; in++) {
1454 struct buffer_head *bh2 = lru->bhs[in];
1456 if (bh2 == bh) {
1457 __brelse(bh2);
1458 } else {
1459 if (out >= BH_LRU_SIZE) {
1460 BUG_ON(evictee != NULL);
1461 evictee = bh2;
1462 } else {
1463 bhs[out++] = bh2;
1467 while (out < BH_LRU_SIZE)
1468 bhs[out++] = NULL;
1469 memcpy(lru->bhs, bhs, sizeof(bhs));
1471 bh_lru_unlock();
1473 if (evictee)
1474 __brelse(evictee);
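/*
 * Editorial sketch (not part of buffer.c): the per-cpu LRU above is a small
 * fixed-size array kept in most-recently-used order - a hit is moved to
 * slot 0, an insert shifts everything down and the oldest entry falls off
 * the end.  A standalone version operating on plain integers, with the
 * get_bh()/__brelse() refcounting omitted:
 */
#include <stdio.h>

#define DEMO_LRU_SIZE 8

/* Insert (or refresh) val as the most recently used entry. */
static void demo_lru_touch(int lru[DEMO_LRU_SIZE], int val)
{
	int i, pos = DEMO_LRU_SIZE - 1;	/* default: evict the last slot */

	for (i = 0; i < DEMO_LRU_SIZE; i++) {
		if (lru[i] == val) {	/* already cached: stop the shift here */
			pos = i;
			break;
		}
	}
	for (i = pos; i > 0; i--)	/* shift everything newer down by one */
		lru[i] = lru[i - 1];
	lru[0] = val;
}

int main(void)
{
	int lru[DEMO_LRU_SIZE] = { 0 };
	int i;

	demo_lru_touch(lru, 11);
	demo_lru_touch(lru, 22);
	demo_lru_touch(lru, 11);	/* hit: 11 moves back to the front */
	for (i = 0; i < DEMO_LRU_SIZE; i++)
		printf("%d ", lru[i]);
	printf("\n");			/* 11 22 0 0 0 0 0 0 */
	return 0;
}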
1478 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1480 static inline struct buffer_head *
1481 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1483 struct buffer_head *ret = NULL;
1484 struct bh_lru *lru;
1485 int i;
1487 check_irqs_on();
1488 bh_lru_lock();
1489 lru = &__get_cpu_var(bh_lrus);
1490 for (i = 0; i < BH_LRU_SIZE; i++) {
1491 struct buffer_head *bh = lru->bhs[i];
1493 if (bh && bh->b_bdev == bdev &&
1494 bh->b_blocknr == block && bh->b_size == size) {
1495 if (i) {
1496 while (i) {
1497 lru->bhs[i] = lru->bhs[i - 1];
1498 i--;
1500 lru->bhs[0] = bh;
1502 get_bh(bh);
1503 ret = bh;
1504 break;
1507 bh_lru_unlock();
1508 return ret;
1512 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1513 * it in the LRU and mark it as accessed. If it is not present then return
1514 * NULL
1516 struct buffer_head *
1517 __find_get_block(struct block_device *bdev, sector_t block, int size)
1519 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1521 if (bh == NULL) {
1522 bh = __find_get_block_slow(bdev, block, size);
1523 if (bh)
1524 bh_lru_install(bh);
1526 if (bh)
1527 touch_buffer(bh);
1528 return bh;
1530 EXPORT_SYMBOL(__find_get_block);
1533 * __getblk will locate (and, if necessary, create) the buffer_head
1534 * which corresponds to the passed block_device, block and size. The
1535 * returned buffer has its reference count incremented.
1537 * __getblk() cannot fail - it just keeps trying. If you pass it an
1538 * illegal block number, __getblk() will happily return a buffer_head
1539 * which represents the non-existent block. Very weird.
1541 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1542 * attempt is failing. FIXME, perhaps?
1544 struct buffer_head *
1545 __getblk(struct block_device *bdev, sector_t block, int size)
1547 struct buffer_head *bh = __find_get_block(bdev, block, size);
1549 might_sleep();
1550 if (bh == NULL)
1551 bh = __getblk_slow(bdev, block, size);
1552 return bh;
1554 EXPORT_SYMBOL(__getblk);
1557 * Do async read-ahead on a buffer..
1559 void __breadahead(struct block_device *bdev, sector_t block, int size)
1561 struct buffer_head *bh = __getblk(bdev, block, size);
1562 ll_rw_block(READA, 1, &bh);
1563 brelse(bh);
1565 EXPORT_SYMBOL(__breadahead);
1568 * __bread() - reads a specified block and returns the bh
1569 * @block: number of block
1570 * @size: size (in bytes) to read
1572 * Reads a specified block, and returns buffer head that contains it.
1573 * It returns NULL if the block was unreadable.
1575 struct buffer_head *
1576 __bread(struct block_device *bdev, sector_t block, int size)
1578 struct buffer_head *bh = __getblk(bdev, block, size);
1580 if (!buffer_uptodate(bh))
1581 bh = __bread_slow(bh);
1582 return bh;
1584 EXPORT_SYMBOL(__bread);
1587 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1588 * This doesn't race because it runs in each cpu either in irq
1589 * or with preempt disabled.
1591 static void invalidate_bh_lru(void *arg)
1593 struct bh_lru *b = &get_cpu_var(bh_lrus);
1594 int i;
1596 for (i = 0; i < BH_LRU_SIZE; i++) {
1597 brelse(b->bhs[i]);
1598 b->bhs[i] = NULL;
1600 put_cpu_var(bh_lrus);
1603 static void invalidate_bh_lrus(void)
1605 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1608 void set_bh_page(struct buffer_head *bh,
1609 struct page *page, unsigned long offset)
1611 bh->b_page = page;
1612 if (offset >= PAGE_SIZE)
1613 BUG();
1614 if (PageHighMem(page))
1616 * This catches illegal uses and preserves the offset:
1618 bh->b_data = (char *)(0 + offset);
1619 else
1620 bh->b_data = page_address(page) + offset;
1622 EXPORT_SYMBOL(set_bh_page);
1625 * Called when truncating a buffer on a page completely.
1627 static inline void discard_buffer(struct buffer_head * bh)
1629 lock_buffer(bh);
1630 clear_buffer_dirty(bh);
1631 bh->b_bdev = NULL;
1632 clear_buffer_mapped(bh);
1633 clear_buffer_req(bh);
1634 clear_buffer_new(bh);
1635 clear_buffer_delay(bh);
1636 unlock_buffer(bh);
1640 * try_to_release_page() - release old fs-specific metadata on a page
1642 * @page: the page which the kernel is trying to free
1643 * @gfp_mask: memory allocation flags (and I/O mode)
1645 * The address_space is to try to release any data against the page
1646 * (presumably at page->private). If the release was successful, return `1'.
1647 * Otherwise return zero.
1649 * The @gfp_mask argument specifies whether I/O may be performed to release
1650 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1652 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1654 int try_to_release_page(struct page *page, int gfp_mask)
1656 struct address_space * const mapping = page->mapping;
1658 BUG_ON(!PageLocked(page));
1659 if (PageWriteback(page))
1660 return 0;
1662 if (mapping && mapping->a_ops->releasepage)
1663 return mapping->a_ops->releasepage(page, gfp_mask);
1664 return try_to_free_buffers(page);
1666 EXPORT_SYMBOL(try_to_release_page);
1669 * block_invalidatepage - invalidate part or all of a buffer-backed page
1671 * @page: the page which is affected
1672 * @offset: the index of the truncation point
1674 * block_invalidatepage() is called when all or part of the page has become
1675 * invalidated by a truncate operation.
1677 * block_invalidatepage() does not have to release all buffers, but it must
1678 * ensure that no dirty buffer is left outside @offset and that no I/O
1679 * is underway against any of the blocks which are outside the truncation
1680 * point. Because the caller is about to free (and possibly reuse) those
1681 * blocks on-disk.
1683 int block_invalidatepage(struct page *page, unsigned long offset)
1685 struct buffer_head *head, *bh, *next;
1686 unsigned int curr_off = 0;
1687 int ret = 1;
1689 BUG_ON(!PageLocked(page));
1690 if (!page_has_buffers(page))
1691 goto out;
1693 head = page_buffers(page);
1694 bh = head;
1695 do {
1696 unsigned int next_off = curr_off + bh->b_size;
1697 next = bh->b_this_page;
1700 * is this block fully invalidated?
1702 if (offset <= curr_off)
1703 discard_buffer(bh);
1704 curr_off = next_off;
1705 bh = next;
1706 } while (bh != head);
1709 * We release buffers only if the entire page is being invalidated.
1710 * The get_block cached value has been unconditionally invalidated,
1711 * so real IO is not possible anymore.
1713 if (offset == 0)
1714 ret = try_to_release_page(page, 0);
1715 out:
1716 return ret;
1718 EXPORT_SYMBOL(block_invalidatepage);
1721 * We attach and possibly dirty the buffers atomically wrt
1722 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1723 * is already excluded via the page lock.
1725 void create_empty_buffers(struct page *page,
1726 unsigned long blocksize, unsigned long b_state)
1728 struct buffer_head *bh, *head, *tail;
1730 head = create_buffers(page, blocksize, 1);
1731 bh = head;
1732 do {
1733 bh->b_state |= b_state;
1734 tail = bh;
1735 bh = bh->b_this_page;
1736 } while (bh);
1737 tail->b_this_page = head;
1739 spin_lock(&page->mapping->private_lock);
1740 if (PageUptodate(page) || PageDirty(page)) {
1741 bh = head;
1742 do {
1743 if (PageDirty(page))
1744 set_buffer_dirty(bh);
1745 if (PageUptodate(page))
1746 set_buffer_uptodate(bh);
1747 bh = bh->b_this_page;
1748 } while (bh != head);
1750 __set_page_buffers(page, head);
1751 spin_unlock(&page->mapping->private_lock);
1753 EXPORT_SYMBOL(create_empty_buffers);
1756 * We are taking a block for data and we don't want any output from any
1757 * buffer-cache aliases starting from the return of that function and
1758 * lasting until the moment when something explicitly marks the buffer
1759 * dirty (hopefully that will not happen until we free that block ;-)
1760 * We don't even need to mark it not-uptodate - nobody can expect
1761 * anything from a newly allocated buffer anyway. We used to use
1762 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1763 * don't want to mark the alias unmapped, for example - it would confuse
1764 * anyone who might pick it with bread() afterwards...
1766 * Also.. Note that bforget() doesn't lock the buffer. So there can
1767 * be writeout I/O going on against recently-freed buffers. We don't
1768 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1769 * only if we really need to. That happens here.
1771 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1773 struct buffer_head *old_bh;
1775 might_sleep();
1777 old_bh = __find_get_block_slow(bdev, block, 0);
1778 if (old_bh) {
1779 clear_buffer_dirty(old_bh);
1780 wait_on_buffer(old_bh);
1781 clear_buffer_req(old_bh);
1782 __brelse(old_bh);
1785 EXPORT_SYMBOL(unmap_underlying_metadata);
1788 * NOTE! All mapped/uptodate combinations are valid:
1790 * Mapped Uptodate Meaning
1792 * No No "unknown" - must do get_block()
1793 * No Yes "hole" - zero-filled
1794 * Yes No "allocated" - allocated on disk, not read in
1795 * Yes Yes "valid" - allocated and up-to-date in memory.
1797 * "Dirty" is valid only with the last case (mapped+uptodate).
1801 * While block_write_full_page is writing back the dirty buffers under
1802 * the page lock, whoever dirtied the buffers may decide to clean them
1803 * again at any time. We handle that by only looking at the buffer
1804 * state inside lock_buffer().
1806 * If block_write_full_page() is called for regular writeback
1807 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1808 * locked buffer. This only can happen if someone has written the buffer
1809 * directly, with submit_bh(). At the address_space level PageWriteback
1810 * prevents this contention from occurring.
1812 static int __block_write_full_page(struct inode *inode, struct page *page,
1813 get_block_t *get_block, struct writeback_control *wbc)
1815 int err;
1816 sector_t block;
1817 sector_t last_block;
1818 struct buffer_head *bh, *head;
1819 int nr_underway = 0;
1821 BUG_ON(!PageLocked(page));
1823 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1825 if (!page_has_buffers(page)) {
1826 create_empty_buffers(page, 1 << inode->i_blkbits,
1827 (1 << BH_Dirty)|(1 << BH_Uptodate));
1831 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1832 * here, and the (potentially unmapped) buffers may become dirty at
1833 * any time. If a buffer becomes dirty here after we've inspected it
1834 * then we just miss that fact, and the page stays dirty.
1836 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1837 * handle that here by just cleaning them.
1840 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1841 head = page_buffers(page);
1842 bh = head;
1845 * Get all the dirty buffers mapped to disk addresses and
1846 * handle any aliases from the underlying blockdev's mapping.
1848 do {
1849 if (block > last_block) {
1851 * mapped buffers outside i_size will occur, because
1852 * this page can be outside i_size when there is a
1853 * truncate in progress.
1856 * The buffer was zeroed by block_write_full_page()
1858 clear_buffer_dirty(bh);
1859 set_buffer_uptodate(bh);
1860 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1861 err = get_block(inode, block, bh, 1);
1862 if (err)
1863 goto recover;
1864 if (buffer_new(bh)) {
1865 /* blockdev mappings never come here */
1866 clear_buffer_new(bh);
1867 unmap_underlying_metadata(bh->b_bdev,
1868 bh->b_blocknr);
1871 bh = bh->b_this_page;
1872 block++;
1873 } while (bh != head);
1875 do {
1876 get_bh(bh);
1877 if (!buffer_mapped(bh))
1878 continue;
1880 * If it's a fully non-blocking write attempt and we cannot
1881 * lock the buffer then redirty the page. Note that this can
1882 * potentially cause a busy-wait loop from pdflush and kswapd
1883 * activity, but those code paths have their own higher-level
1884 * throttling.
1886 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1887 lock_buffer(bh);
1888 } else if (test_set_buffer_locked(bh)) {
1889 redirty_page_for_writepage(wbc, page);
1890 continue;
1892 if (test_clear_buffer_dirty(bh)) {
1893 mark_buffer_async_write(bh);
1894 } else {
1895 unlock_buffer(bh);
1897 } while ((bh = bh->b_this_page) != head);
1900 * The page and its buffers are protected by PageWriteback(), so we can
1901 * drop the bh refcounts early.
1903 BUG_ON(PageWriteback(page));
1904 set_page_writeback(page);
1905 unlock_page(page);
1907 do {
1908 struct buffer_head *next = bh->b_this_page;
1909 if (buffer_async_write(bh)) {
1910 submit_bh(WRITE, bh);
1911 nr_underway++;
1913 put_bh(bh);
1914 bh = next;
1915 } while (bh != head);
1917 err = 0;
1918 done:
1919 if (nr_underway == 0) {
1921 * The page was marked dirty, but the buffers were
1922 * clean. Someone wrote them back by hand with
1923 * ll_rw_block/submit_bh. A rare case.
1925 int uptodate = 1;
1926 do {
1927 if (!buffer_uptodate(bh)) {
1928 uptodate = 0;
1929 break;
1931 bh = bh->b_this_page;
1932 } while (bh != head);
1933 if (uptodate)
1934 SetPageUptodate(page);
1935 end_page_writeback(page);
1937 * The page and buffer_heads can be released at any time from
1938 * here on.
1940 wbc->pages_skipped++; /* We didn't write this page */
1942 return err;
1944 recover:
1946 * ENOSPC, or some other error. We may already have added some
1947 * blocks to the file, so we need to write these out to avoid
1948 * exposing stale data.
1949 * The page is currently locked and not marked for writeback
1951 bh = head;
1952 /* Recovery: lock and submit the mapped buffers */
1953 do {
1954 get_bh(bh);
1955 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1956 lock_buffer(bh);
1957 mark_buffer_async_write(bh);
1958 } else {
1960 * The buffer may have been set dirty during
1961 * attachment to a dirty page.
1963 clear_buffer_dirty(bh);
1965 } while ((bh = bh->b_this_page) != head);
1966 SetPageError(page);
1967 BUG_ON(PageWriteback(page));
1968 set_page_writeback(page);
1969 unlock_page(page);
1970 do {
1971 struct buffer_head *next = bh->b_this_page;
1972 if (buffer_async_write(bh)) {
1973 clear_buffer_dirty(bh);
1974 submit_bh(WRITE, bh);
1975 nr_underway++;
1977 put_bh(bh);
1978 bh = next;
1979 } while (bh != head);
1980 goto done;
1983 static int __block_prepare_write(struct inode *inode, struct page *page,
1984 unsigned from, unsigned to, get_block_t *get_block)
1986 unsigned block_start, block_end;
1987 sector_t block;
1988 int err = 0;
1989 unsigned blocksize, bbits;
1990 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1992 BUG_ON(!PageLocked(page));
1993 BUG_ON(from > PAGE_CACHE_SIZE);
1994 BUG_ON(to > PAGE_CACHE_SIZE);
1995 BUG_ON(from > to);
1997 blocksize = 1 << inode->i_blkbits;
1998 if (!page_has_buffers(page))
1999 create_empty_buffers(page, blocksize, 0);
2000 head = page_buffers(page);
2002 bbits = inode->i_blkbits;
2003 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2005 for(bh = head, block_start = 0; bh != head || !block_start;
2006 block++, block_start=block_end, bh = bh->b_this_page) {
2007 block_end = block_start + blocksize;
2008 if (block_end <= from || block_start >= to) {
2009 if (PageUptodate(page)) {
2010 if (!buffer_uptodate(bh))
2011 set_buffer_uptodate(bh);
2013 continue;
2015 if (buffer_new(bh))
2016 clear_buffer_new(bh);
2017 if (!buffer_mapped(bh)) {
2018 err = get_block(inode, block, bh, 1);
2019 if (err)
2020 goto out;
2021 if (buffer_new(bh)) {
2022 clear_buffer_new(bh);
2023 unmap_underlying_metadata(bh->b_bdev,
2024 bh->b_blocknr);
2025 if (PageUptodate(page)) {
2026 set_buffer_uptodate(bh);
2027 continue;
2029 if (block_end > to || block_start < from) {
2030 void *kaddr;
2032 kaddr = kmap_atomic(page, KM_USER0);
2033 if (block_end > to)
2034 memset(kaddr+to, 0,
2035 block_end-to);
2036 if (block_start < from)
2037 memset(kaddr+block_start,
2038 0, from-block_start);
2039 flush_dcache_page(page);
2040 kunmap_atomic(kaddr, KM_USER0);
2042 continue;
2045 if (PageUptodate(page)) {
2046 if (!buffer_uptodate(bh))
2047 set_buffer_uptodate(bh);
2048 continue;
2050 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2051 (block_start < from || block_end > to)) {
2052 ll_rw_block(READ, 1, &bh);
2053 *wait_bh++=bh;
2057 * If we issued read requests - let them complete.
2059 while(wait_bh > wait) {
2060 wait_on_buffer(*--wait_bh);
2061 if (!buffer_uptodate(*wait_bh))
2062 return -EIO;
2064 return 0;
2065 out:
2067 * Zero out any newly allocated blocks to avoid exposing stale
2068 * data. If BH_New is set, we know that the block was newly
2069 * allocated in the above loop.
2071 bh = head;
2072 block_start = 0;
2073 do {
2074 block_end = block_start+blocksize;
2075 if (block_end <= from)
2076 goto next_bh;
2077 if (block_start >= to)
2078 break;
2079 if (buffer_new(bh)) {
2080 void *kaddr;
2082 clear_buffer_new(bh);
2083 kaddr = kmap_atomic(page, KM_USER0);
2084 memset(kaddr+block_start, 0, bh->b_size);
2085 kunmap_atomic(kaddr, KM_USER0);
2086 set_buffer_uptodate(bh);
2087 mark_buffer_dirty(bh);
2089 next_bh:
2090 block_start = block_end;
2091 bh = bh->b_this_page;
2092 } while (bh != head);
2093 return err;
2096 static int __block_commit_write(struct inode *inode, struct page *page,
2097 unsigned from, unsigned to)
2099 unsigned block_start, block_end;
2100 int partial = 0;
2101 unsigned blocksize;
2102 struct buffer_head *bh, *head;
2104 blocksize = 1 << inode->i_blkbits;
2106 for(bh = head = page_buffers(page), block_start = 0;
2107 bh != head || !block_start;
2108 block_start=block_end, bh = bh->b_this_page) {
2109 block_end = block_start + blocksize;
2110 if (block_end <= from || block_start >= to) {
2111 if (!buffer_uptodate(bh))
2112 partial = 1;
2113 } else {
2114 set_buffer_uptodate(bh);
2115 mark_buffer_dirty(bh);
2120 * If this is a partial write which happened to make all buffers
2121 * uptodate then we can optimize away a bogus readpage() for
2122 * the next read(). Here we 'discover' whether the page went
2123 * uptodate as a result of this (potentially partial) write.
2125 if (!partial)
2126 SetPageUptodate(page);
2127 return 0;
2131 * Generic "read page" function for block devices that have the normal
2132 * get_block functionality. This is most of the block device filesystems.
2133 * Reads the page asynchronously --- the unlock_buffer() and
2134 * set/clear_buffer_uptodate() functions propagate buffer state into the
2135 * page struct once IO has completed.
2137 int block_read_full_page(struct page *page, get_block_t *get_block)
2139 struct inode *inode = page->mapping->host;
2140 sector_t iblock, lblock;
2141 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2142 unsigned int blocksize;
2143 int nr, i;
2144 int fully_mapped = 1;
2146 if (!PageLocked(page))
2147 PAGE_BUG(page);
2148 blocksize = 1 << inode->i_blkbits;
2149 if (!page_has_buffers(page))
2150 create_empty_buffers(page, blocksize, 0);
2151 head = page_buffers(page);
2153 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2154 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2155 bh = head;
2156 nr = 0;
2157 i = 0;
2159 do {
2160 if (buffer_uptodate(bh))
2161 continue;
2163 if (!buffer_mapped(bh)) {
2164 fully_mapped = 0;
2165 if (iblock < lblock) {
2166 if (get_block(inode, iblock, bh, 0))
2167 SetPageError(page);
2169 if (!buffer_mapped(bh)) {
2170 void *kaddr = kmap_atomic(page, KM_USER0);
2171 memset(kaddr + i * blocksize, 0, blocksize);
2172 flush_dcache_page(page);
2173 kunmap_atomic(kaddr, KM_USER0);
2174 set_buffer_uptodate(bh);
2175 continue;
2178 * get_block() might have updated the buffer
2179 * synchronously
2181 if (buffer_uptodate(bh))
2182 continue;
2184 arr[nr++] = bh;
2185 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2187 if (fully_mapped)
2188 SetPageMappedToDisk(page);
2190 if (!nr) {
2192 * All buffers are uptodate - we can set the page uptodate
2193 * as well. But not if get_block() returned an error.
2195 if (!PageError(page))
2196 SetPageUptodate(page);
2197 unlock_page(page);
2198 return 0;
2201 /* Stage two: lock the buffers */
2202 for (i = 0; i < nr; i++) {
2203 bh = arr[i];
2204 lock_buffer(bh);
2205 mark_buffer_async_read(bh);
2209 * Stage 3: start the IO. Check for uptodateness
2210 * inside the buffer lock in case another process reading
2211 * the underlying blockdev brought it uptodate (the sct fix).
2213 for (i = 0; i < nr; i++) {
2214 bh = arr[i];
2215 if (buffer_uptodate(bh))
2216 end_buffer_async_read(bh, 1);
2217 else
2218 submit_bh(READ, bh);
2219 }
2220 return 0;
2221 }
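/*
 * [Editorial illustration -- not part of the original source.]  A minimal,
 * hedged sketch of the usual caller: a block-based filesystem points its
 * ->readpage() method at block_read_full_page() and supplies its own
 * block-mapping routine.  example_get_block() and example_readpage() are
 * hypothetical names used only for this sketch.
 */
extern int example_get_block(struct inode *, sector_t,
                        struct buffer_head *, int);

static int example_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, example_get_block);
}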
2223 /* utility function for filesystems that need to do work on expanding
2224 * truncates. Uses prepare/commit_write to allow the filesystem to
2225 * deal with the hole.
2227 int generic_cont_expand(struct inode *inode, loff_t size)
2229 struct address_space *mapping = inode->i_mapping;
2230 struct page *page;
2231 unsigned long index, offset, limit;
2232 int err;
2234 err = -EFBIG;
2235 limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2236 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2237 send_sig(SIGXFSZ, current, 0);
2238 goto out;
2240 if (size > inode->i_sb->s_maxbytes)
2241 goto out;
2243 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2245 /* ugh. in prepare/commit_write, if from==to==start of block, we
2246 ** skip the prepare. make sure we never send an offset for the start
2247 ** of a block
2249 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2250 offset++;
2252 index = size >> PAGE_CACHE_SHIFT;
2253 err = -ENOMEM;
2254 page = grab_cache_page(mapping, index);
2255 if (!page)
2256 goto out;
2257 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2258 if (!err) {
2259 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2261 unlock_page(page);
2262 page_cache_release(page);
2263 if (err > 0)
2264 err = 0;
2265 out:
2266 return err;
2267 }
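/*
 * [Editorial illustration -- not part of the original source.]  A hedged
 * sketch of a typical caller: a filesystem that cannot represent holes
 * expands the file up to the new size (zeroing the gap through its
 * prepare_write/commit_write methods) before letting anything land beyond
 * the old end of file.  example_grow_file() is a hypothetical helper.
 */
static int example_grow_file(struct inode *inode, loff_t new_size)
{
        if (new_size <= i_size_read(inode))
                return 0;
        return generic_cont_expand(inode, new_size);
}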
2269 /*
2270 * For moronic filesystems that do not allow holes in files.
2271 * We may have to extend the file.
2272 */
2274 int cont_prepare_write(struct page *page, unsigned offset,
2275 unsigned to, get_block_t *get_block, loff_t *bytes)
2277 struct address_space *mapping = page->mapping;
2278 struct inode *inode = mapping->host;
2279 struct page *new_page;
2280 pgoff_t pgpos;
2281 long status;
2282 unsigned zerofrom;
2283 unsigned blocksize = 1 << inode->i_blkbits;
2284 void *kaddr;
2286 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2287 status = -ENOMEM;
2288 new_page = grab_cache_page(mapping, pgpos);
2289 if (!new_page)
2290 goto out;
2291 /* we might sleep */
2292 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2293 unlock_page(new_page);
2294 page_cache_release(new_page);
2295 continue;
2297 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2298 if (zerofrom & (blocksize-1)) {
2299 *bytes |= (blocksize-1);
2300 (*bytes)++;
2302 status = __block_prepare_write(inode, new_page, zerofrom,
2303 PAGE_CACHE_SIZE, get_block);
2304 if (status)
2305 goto out_unmap;
2306 kaddr = kmap_atomic(new_page, KM_USER0);
2307 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2308 flush_dcache_page(new_page);
2309 kunmap_atomic(kaddr, KM_USER0);
2310 __block_commit_write(inode, new_page,
2311 zerofrom, PAGE_CACHE_SIZE);
2312 unlock_page(new_page);
2313 page_cache_release(new_page);
2316 if (page->index < pgpos) {
2317 /* completely inside the area */
2318 zerofrom = offset;
2319 } else {
2320 /* page covers the boundary, find the boundary offset */
2321 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2323 /* if we will expand the thing last block will be filled */
2324 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2325 *bytes |= (blocksize-1);
2326 (*bytes)++;
2329 /* starting below the boundary? Nothing to zero out */
2330 if (offset <= zerofrom)
2331 zerofrom = offset;
2333 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2334 if (status)
2335 goto out1;
2336 if (zerofrom < offset) {
2337 kaddr = kmap_atomic(page, KM_USER0);
2338 memset(kaddr+zerofrom, 0, offset-zerofrom);
2339 flush_dcache_page(page);
2340 kunmap_atomic(kaddr, KM_USER0);
2341 __block_commit_write(inode, page, zerofrom, offset);
2343 return 0;
2344 out1:
2345 ClearPageUptodate(page);
2346 return status;
2348 out_unmap:
2349 ClearPageUptodate(new_page);
2350 unlock_page(new_page);
2351 page_cache_release(new_page);
2352 out:
2353 return status;
2354 }
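/*
 * [Editorial illustration -- not part of the original source.]  A hedged
 * sketch of how a hole-less (FAT-style) filesystem would use
 * cont_prepare_write() as its ->prepare_write(): it passes a pointer to its
 * own "bytes allocated so far" counter so the region between the old and
 * new end of data gets zeroed.  example_get_block(), example_allocated_bytes
 * and example_cont_prepare_write() are hypothetical names.
 */
extern int example_get_block(struct inode *, sector_t,
                        struct buffer_head *, int);
extern loff_t example_allocated_bytes;

static int example_cont_prepare_write(struct file *file, struct page *page,
                        unsigned from, unsigned to)
{
        return cont_prepare_write(page, from, to, example_get_block,
                                &example_allocated_bytes);
}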
2356 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2357 get_block_t *get_block)
2359 struct inode *inode = page->mapping->host;
2360 int err = __block_prepare_write(inode, page, from, to, get_block);
2361 if (err)
2362 ClearPageUptodate(page);
2363 return err;
2366 int block_commit_write(struct page *page, unsigned from, unsigned to)
2368 struct inode *inode = page->mapping->host;
2369 __block_commit_write(inode,page,from,to);
2370 return 0;
2373 int generic_commit_write(struct file *file, struct page *page,
2374 unsigned from, unsigned to)
2376 struct inode *inode = page->mapping->host;
2377 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2378 __block_commit_write(inode,page,from,to);
2380 * No need to use i_size_read() here, the i_size
2381 * cannot change under us because we hold i_sem.
2383 if (pos > inode->i_size) {
2384 i_size_write(inode, pos);
2385 mark_inode_dirty(inode);
2386 }
2387 return 0;
2388 }
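/*
 * [Editorial illustration -- not part of the original source.]  The common
 * pairing, sketched with hypothetical names: ->prepare_write() wraps
 * block_prepare_write() with the filesystem's block-mapping routine, while
 * ->commit_write() can point straight at generic_commit_write(), which
 * dirties the buffers and updates i_size under i_sem.
 */
extern int example_get_block(struct inode *, sector_t,
                        struct buffer_head *, int);

static int example_prepare_write(struct file *file, struct page *page,
                        unsigned from, unsigned to)
{
        return block_prepare_write(page, from, to, example_get_block);
}
/* ->commit_write is then simply generic_commit_write. */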
2392 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2393 * immediately, while under the page lock. So it needs a special end_io
2394 * handler which does not touch the bh after unlocking it.
2396 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2397 * a race there is benign: unlock_buffer() only uses the bh's address for
2398 * hashing after unlocking the buffer, so it doesn't actually touch the bh
2399 * itself.
2401 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2403 if (uptodate) {
2404 set_buffer_uptodate(bh);
2405 } else {
2406 /* This happens due to failed READA attempts. */
2407 clear_buffer_uptodate(bh);
2409 unlock_buffer(bh);
2413 * On entry, the page is fully not uptodate.
2414 * On exit the page is fully uptodate in the areas outside (from,to)
2416 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2417 get_block_t *get_block)
2419 struct inode *inode = page->mapping->host;
2420 const unsigned blkbits = inode->i_blkbits;
2421 const unsigned blocksize = 1 << blkbits;
2422 struct buffer_head map_bh;
2423 struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2424 unsigned block_in_page;
2425 unsigned block_start;
2426 sector_t block_in_file;
2427 char *kaddr;
2428 int nr_reads = 0;
2429 int i;
2430 int ret = 0;
2431 int is_mapped_to_disk = 1;
2432 int dirtied_it = 0;
2434 if (PageMappedToDisk(page))
2435 return 0;
2437 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2438 map_bh.b_page = page;
2441 * We loop across all blocks in the page, whether or not they are
2442 * part of the affected region. This is so we can discover if the
2443 * page is fully mapped-to-disk.
2445 for (block_start = 0, block_in_page = 0;
2446 block_start < PAGE_CACHE_SIZE;
2447 block_in_page++, block_start += blocksize) {
2448 unsigned block_end = block_start + blocksize;
2449 int create;
2451 map_bh.b_state = 0;
2452 create = 1;
2453 if (block_start >= to)
2454 create = 0;
2455 ret = get_block(inode, block_in_file + block_in_page,
2456 &map_bh, create);
2457 if (ret)
2458 goto failed;
2459 if (!buffer_mapped(&map_bh))
2460 is_mapped_to_disk = 0;
2461 if (buffer_new(&map_bh))
2462 unmap_underlying_metadata(map_bh.b_bdev,
2463 map_bh.b_blocknr);
2464 if (PageUptodate(page))
2465 continue;
2466 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2467 kaddr = kmap_atomic(page, KM_USER0);
2468 if (block_start < from) {
2469 memset(kaddr+block_start, 0, from-block_start);
2470 dirtied_it = 1;
2472 if (block_end > to) {
2473 memset(kaddr + to, 0, block_end - to);
2474 dirtied_it = 1;
2476 flush_dcache_page(page);
2477 kunmap_atomic(kaddr, KM_USER0);
2478 continue;
2480 if (buffer_uptodate(&map_bh))
2481 continue; /* reiserfs does this */
2482 if (block_start < from || block_end > to) {
2483 struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2485 if (!bh) {
2486 ret = -ENOMEM;
2487 goto failed;
2489 bh->b_state = map_bh.b_state;
2490 atomic_set(&bh->b_count, 0);
2491 bh->b_this_page = NULL;
2492 bh->b_page = page;
2493 bh->b_blocknr = map_bh.b_blocknr;
2494 bh->b_size = blocksize;
2495 bh->b_data = (char *)(long)block_start;
2496 bh->b_bdev = map_bh.b_bdev;
2497 bh->b_private = NULL;
2498 read_bh[nr_reads++] = bh;
2502 if (nr_reads) {
2503 struct buffer_head *bh;
2506 * The page is locked, so these buffers are protected from
2507 * any VM or truncate activity. Hence we don't need to care
2508 * for the buffer_head refcounts.
2510 for (i = 0; i < nr_reads; i++) {
2511 bh = read_bh[i];
2512 lock_buffer(bh);
2513 bh->b_end_io = end_buffer_read_nobh;
2514 submit_bh(READ, bh);
2516 for (i = 0; i < nr_reads; i++) {
2517 bh = read_bh[i];
2518 wait_on_buffer(bh);
2519 if (!buffer_uptodate(bh))
2520 ret = -EIO;
2521 free_buffer_head(bh);
2522 read_bh[i] = NULL;
2524 if (ret)
2525 goto failed;
2528 if (is_mapped_to_disk)
2529 SetPageMappedToDisk(page);
2530 SetPageUptodate(page);
2533 * Setting the page dirty here isn't necessary for the prepare_write
2534 * function - commit_write will do that. But if/when this function is
2535 * used within the pagefault handler to ensure that all mmapped pages
2536 * have backing space in the filesystem, we will need to dirty the page
2537 * if its contents were altered.
2539 if (dirtied_it)
2540 set_page_dirty(page);
2542 return 0;
2544 failed:
2545 for (i = 0; i < nr_reads; i++) {
2546 if (read_bh[i])
2547 free_buffer_head(read_bh[i]);
2551 * Error recovery is pretty slack. Clear the page and mark it dirty
2552 * so we'll later zero out any blocks which _were_ allocated.
2554 kaddr = kmap_atomic(page, KM_USER0);
2555 memset(kaddr, 0, PAGE_CACHE_SIZE);
2556 kunmap_atomic(kaddr, KM_USER0);
2557 SetPageUptodate(page);
2558 set_page_dirty(page);
2559 return ret;
2561 EXPORT_SYMBOL(nobh_prepare_write);
2563 int nobh_commit_write(struct file *file, struct page *page,
2564 unsigned from, unsigned to)
2566 struct inode *inode = page->mapping->host;
2567 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2569 set_page_dirty(page);
2570 if (pos > inode->i_size) {
2571 i_size_write(inode, pos);
2572 mark_inode_dirty(inode);
2573 }
2574 return 0;
2575 }
2576 EXPORT_SYMBOL(nobh_commit_write);
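/*
 * [Editorial illustration -- not part of the original source.]  A hedged
 * sketch of the buffer-head-less write path: ->prepare_write() wraps
 * nobh_prepare_write() with the filesystem's block-mapping routine and
 * ->commit_write() is nobh_commit_write() itself, so no buffer_heads stay
 * attached to the page afterwards.  example_get_block() is hypothetical.
 */
extern int example_get_block(struct inode *, sector_t,
                        struct buffer_head *, int);

static int example_nobh_prepare_write(struct file *file, struct page *page,
                        unsigned from, unsigned to)
{
        return nobh_prepare_write(page, from, to, example_get_block);
}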
2579 * This function assumes that ->prepare_write() uses nobh_prepare_write().
2581 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2583 struct inode *inode = mapping->host;
2584 unsigned blocksize = 1 << inode->i_blkbits;
2585 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2586 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2587 unsigned to;
2588 struct page *page;
2589 struct address_space_operations *a_ops = mapping->a_ops;
2590 char *kaddr;
2591 int ret = 0;
2593 if ((offset & (blocksize - 1)) == 0)
2594 goto out;
2596 ret = -ENOMEM;
2597 page = grab_cache_page(mapping, index);
2598 if (!page)
2599 goto out;
2601 to = (offset + blocksize) & ~(blocksize - 1);
2602 ret = a_ops->prepare_write(NULL, page, offset, to);
2603 if (ret == 0) {
2604 kaddr = kmap_atomic(page, KM_USER0);
2605 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2606 flush_dcache_page(page);
2607 kunmap_atomic(kaddr, KM_USER0);
2608 set_page_dirty(page);
2610 unlock_page(page);
2611 page_cache_release(page);
2612 out:
2613 return ret;
2615 EXPORT_SYMBOL(nobh_truncate_page);
2617 int block_truncate_page(struct address_space *mapping,
2618 loff_t from, get_block_t *get_block)
2620 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2621 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2622 unsigned blocksize;
2623 pgoff_t iblock;
2624 unsigned length, pos;
2625 struct inode *inode = mapping->host;
2626 struct page *page;
2627 struct buffer_head *bh;
2628 void *kaddr;
2629 int err;
2631 blocksize = 1 << inode->i_blkbits;
2632 length = offset & (blocksize - 1);
2634 /* Block boundary? Nothing to do */
2635 if (!length)
2636 return 0;
2638 length = blocksize - length;
2639 iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2641 page = grab_cache_page(mapping, index);
2642 err = -ENOMEM;
2643 if (!page)
2644 goto out;
2646 if (!page_has_buffers(page))
2647 create_empty_buffers(page, blocksize, 0);
2649 /* Find the buffer that contains "offset" */
2650 bh = page_buffers(page);
2651 pos = blocksize;
2652 while (offset >= pos) {
2653 bh = bh->b_this_page;
2654 iblock++;
2655 pos += blocksize;
2658 err = 0;
2659 if (!buffer_mapped(bh)) {
2660 err = get_block(inode, iblock, bh, 0);
2661 if (err)
2662 goto unlock;
2663 /* unmapped? It's a hole - nothing to do */
2664 if (!buffer_mapped(bh))
2665 goto unlock;
2668 /* Ok, it's mapped. Make sure it's up-to-date */
2669 if (PageUptodate(page))
2670 set_buffer_uptodate(bh);
2672 if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2673 err = -EIO;
2674 ll_rw_block(READ, 1, &bh);
2675 wait_on_buffer(bh);
2676 /* Uhhuh. Read error. Complain and punt. */
2677 if (!buffer_uptodate(bh))
2678 goto unlock;
2681 kaddr = kmap_atomic(page, KM_USER0);
2682 memset(kaddr + offset, 0, length);
2683 flush_dcache_page(page);
2684 kunmap_atomic(kaddr, KM_USER0);
2686 mark_buffer_dirty(bh);
2687 err = 0;
2689 unlock:
2690 unlock_page(page);
2691 page_cache_release(page);
2692 out:
2693 return err;
2694 }
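/*
 * [Editorial illustration -- not part of the original source.]  A hedged
 * sketch of the usual truncate-path caller: before shrinking i_size, the
 * filesystem zeroes the tail of the block that becomes the new last block,
 * so stale data past EOF is never exposed by a later mmap or expansion.
 * example_get_block() and example_truncate() are hypothetical.
 */
extern int example_get_block(struct inode *, sector_t,
                        struct buffer_head *, int);

static int example_truncate(struct inode *inode, loff_t new_size)
{
        int err;

        err = block_truncate_page(inode->i_mapping, new_size,
                                example_get_block);
        if (err)
                return err;
        /* ...filesystem-specific freeing of blocks past new_size here... */
        return 0;
}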
2697 * The generic ->writepage function for buffer-backed address_spaces
2699 int block_write_full_page(struct page *page, get_block_t *get_block,
2700 struct writeback_control *wbc)
2702 struct inode * const inode = page->mapping->host;
2703 loff_t i_size = i_size_read(inode);
2704 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2705 unsigned offset;
2706 void *kaddr;
2708 /* Is the page fully inside i_size? */
2709 if (page->index < end_index)
2710 return __block_write_full_page(inode, page, get_block, wbc);
2712 /* Is the page fully outside i_size? (truncate in progress) */
2713 offset = i_size & (PAGE_CACHE_SIZE-1);
2714 if (page->index >= end_index+1 || !offset) {
2716 * The page may have dirty, unmapped buffers. For example,
2717 * they may have been added in ext3_writepage(). Make them
2718 * freeable here, so the page does not leak.
2720 block_invalidatepage(page, 0);
2721 unlock_page(page);
2722 return 0; /* don't care */
2726 * The page straddles i_size. It must be zeroed out on each and every
2727 * writepage invocation because it may be mmapped. "A file is mapped
2728 * in multiples of the page size. For a file that is not a multiple of
2729 * the page size, the remaining memory is zeroed when mapped, and
2730 * writes to that region are not written out to the file."
2732 kaddr = kmap_atomic(page, KM_USER0);
2733 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2734 flush_dcache_page(page);
2735 kunmap_atomic(kaddr, KM_USER0);
2736 return __block_write_full_page(inode, page, get_block, wbc);
2737 }
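/*
 * [Editorial illustration -- not part of the original source.]  The
 * matching ->writepage() wrapper, with hypothetical names: everything past
 * the block-mapping callback is handled by block_write_full_page().
 */
extern int example_get_block(struct inode *, sector_t,
                        struct buffer_head *, int);

static int example_writepage(struct page *page, struct writeback_control *wbc)
{
        return block_write_full_page(page, example_get_block, wbc);
}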
2739 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2740 get_block_t *get_block)
2742 struct buffer_head tmp;
2743 struct inode *inode = mapping->host;
2744 tmp.b_state = 0;
2745 tmp.b_blocknr = 0;
2746 get_block(inode, block, &tmp, 0);
2747 return tmp.b_blocknr;
2748 }
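/*
 * [Editorial illustration -- not part of the original source.]  A hedged
 * sketch of the ->bmap() wrapper (used, for example, by the FIBMAP ioctl
 * and by swap-file setup).  example_get_block() and example_bmap() are
 * hypothetical names.
 */
extern int example_get_block(struct inode *, sector_t,
                        struct buffer_head *, int);

static sector_t example_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping, block, example_get_block);
}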
2750 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2752 struct buffer_head *bh = bio->bi_private;
2754 if (bio->bi_size)
2755 return 1;
2757 if (err == -EOPNOTSUPP) {
2758 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2759 set_bit(BH_Eopnotsupp, &bh->b_state);
2762 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2763 bio_put(bio);
2764 return 0;
2767 int submit_bh(int rw, struct buffer_head * bh)
2769 struct bio *bio;
2770 int ret = 0;
2772 BUG_ON(!buffer_locked(bh));
2773 BUG_ON(!buffer_mapped(bh));
2774 BUG_ON(!bh->b_end_io);
2776 if (buffer_ordered(bh) && (rw == WRITE))
2777 rw = WRITE_BARRIER;
2780 * Only clear out a write error when rewriting, should this
2781 * include WRITE_SYNC as well?
2783 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2784 clear_buffer_write_io_error(bh);
2787 * from here on down, it's all bio -- do the initial mapping,
2788 * submit_bio -> generic_make_request may further map this bio around
2790 bio = bio_alloc(GFP_NOIO, 1);
2792 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2793 bio->bi_bdev = bh->b_bdev;
2794 bio->bi_io_vec[0].bv_page = bh->b_page;
2795 bio->bi_io_vec[0].bv_len = bh->b_size;
2796 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2798 bio->bi_vcnt = 1;
2799 bio->bi_idx = 0;
2800 bio->bi_size = bh->b_size;
2802 bio->bi_end_io = end_bio_bh_io_sync;
2803 bio->bi_private = bh;
2805 bio_get(bio);
2806 submit_bio(rw, bio);
2808 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2809 ret = -EOPNOTSUPP;
2811 bio_put(bio);
2812 return ret;
2813 }
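/*
 * [Editorial illustration -- not part of the original source.]  One
 * conventional way to drive submit_bh() by hand for a single block,
 * mirroring the bread()-style pattern used elsewhere in this file: take a
 * reference for the I/O (end_buffer_read_sync() drops it), submit, then
 * sleep until the buffer is unlocked.  example_read_bh_sync() is a
 * hypothetical helper.
 */
static int example_read_bh_sync(struct buffer_head *bh)
{
        lock_buffer(bh);
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return 0;
        }
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(READ, bh);
        wait_on_buffer(bh);
        return buffer_uptodate(bh) ? 0 : -EIO;
}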
2816 * ll_rw_block: low-level access to block devices (DEPRECATED)
2817 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2818 * @nr: number of &struct buffer_heads in the array
2819 * @bhs: array of pointers to &struct buffer_head
2821 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2822 * and requests an I/O operation on them, either a %READ or a %WRITE.
2823 * The third %READA option is described in the documentation for
2824 * generic_make_request() which ll_rw_block() calls.
2826 * This function drops any buffer that it cannot get a lock on (with the
2827 * BH_Lock state bit), any buffer that appears to be clean when doing a
2828 * write request, and any buffer that appears to be up-to-date when doing
2829 * a read request. Further, it marks as clean buffers that are processed for
2830 * writing (the buffer cache won't assume that they are actually clean until
2831 * the buffer gets unlocked).
2833 * ll_rw_block sets b_end_io to a simple completion handler that marks
2834 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2835 * any waiters.
2837 * All of the buffers must be for the same device, and their size must be a
2838 * multiple of the currently approved block size for the device.
2840 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2842 int i;
2844 for (i = 0; i < nr; i++) {
2845 struct buffer_head *bh = bhs[i];
2847 if (test_set_buffer_locked(bh))
2848 continue;
2850 get_bh(bh);
2851 if (rw == WRITE) {
2852 bh->b_end_io = end_buffer_write_sync;
2853 if (test_clear_buffer_dirty(bh)) {
2854 submit_bh(WRITE, bh);
2855 continue;
2857 } else {
2858 bh->b_end_io = end_buffer_read_sync;
2859 if (!buffer_uptodate(bh)) {
2860 submit_bh(rw, bh);
2861 continue;
2862 }
2863 }
2864 unlock_buffer(bh);
2865 put_bh(bh);
2866 }
2867 }
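/*
 * [Editorial illustration -- not part of the original source.]  A hedged
 * sketch of opportunistic readahead with ll_rw_block(): buffers that are
 * already up to date or already locked are silently skipped, exactly as
 * described above, so the caller need not check them first.
 * example_readahead() and its arguments are hypothetical.
 */
static void example_readahead(struct super_block *sb, sector_t *blocks, int nr)
{
        struct buffer_head *bhs[16];
        int i, count = 0;

        for (i = 0; i < nr && count < 16; i++) {
                struct buffer_head *bh = sb_getblk(sb, blocks[i]);
                if (bh)
                        bhs[count++] = bh;
        }
        ll_rw_block(READA, count, bhs);
        for (i = 0; i < count; i++)
                brelse(bhs[i]);         /* the I/O holds its own reference */
}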
2870 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2871 * and then start new I/O and then wait upon it. The caller must have a ref on
2872 * the buffer_head.
2874 int sync_dirty_buffer(struct buffer_head *bh)
2876 int ret = 0;
2878 WARN_ON(atomic_read(&bh->b_count) < 1);
2879 lock_buffer(bh);
2880 if (test_clear_buffer_dirty(bh)) {
2881 get_bh(bh);
2882 bh->b_end_io = end_buffer_write_sync;
2883 ret = submit_bh(WRITE, bh);
2884 wait_on_buffer(bh);
2885 if (buffer_eopnotsupp(bh)) {
2886 clear_buffer_eopnotsupp(bh);
2887 ret = -EOPNOTSUPP;
2889 if (!ret && !buffer_uptodate(bh))
2890 ret = -EIO;
2891 } else {
2892 unlock_buffer(bh);
2893 }
2894 return ret;
2895 }
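/*
 * [Editorial illustration -- not part of the original source.]  A hedged
 * sketch of the data-integrity pattern described above: modify a metadata
 * block in the buffer cache, mark it dirty, then use sync_dirty_buffer()
 * to write it out and wait for completion.  example_update_super() and the
 * block number are hypothetical.
 */
static int example_update_super(struct super_block *sb, sector_t blocknr)
{
        struct buffer_head *bh;
        int err;

        bh = sb_bread(sb, blocknr);             /* read (or find) the block */
        if (!bh)
                return -EIO;
        /* ...modify the on-disk structure in bh->b_data here... */
        mark_buffer_dirty(bh);
        err = sync_dirty_buffer(bh);            /* submit and wait */
        brelse(bh);
        return err;
}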
2898 * try_to_free_buffers() checks if all the buffers on this particular page
2899 * are unused, and releases them if so.
2901 * Exclusion against try_to_free_buffers may be obtained by either
2902 * locking the page or by holding its mapping's private_lock.
2904 * If the page is dirty but all the buffers are clean then we need to
2905 * be sure to mark the page clean as well. This is because the page
2906 * may be against a block device, and a later reattachment of buffers
2907 * to a dirty page will set *all* buffers dirty, which would corrupt
2908 * filesystem data on the same device.
2910 * The same applies to regular filesystem pages: if all the buffers are
2911 * clean then we set the page clean and proceed. To do that, we require
2912 * total exclusion from __set_page_dirty_buffers(). That is obtained with
2913 * private_lock.
2915 * try_to_free_buffers() is non-blocking.
2917 static inline int buffer_busy(struct buffer_head *bh)
2919 return atomic_read(&bh->b_count) |
2920 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2923 static int
2924 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2926 struct buffer_head *head = page_buffers(page);
2927 struct buffer_head *bh;
2929 bh = head;
2930 do {
2931 if (buffer_write_io_error(bh))
2932 set_bit(AS_EIO, &page->mapping->flags);
2933 if (buffer_busy(bh))
2934 goto failed;
2935 bh = bh->b_this_page;
2936 } while (bh != head);
2938 do {
2939 struct buffer_head *next = bh->b_this_page;
2941 if (!list_empty(&bh->b_assoc_buffers))
2942 __remove_assoc_queue(bh);
2943 bh = next;
2944 } while (bh != head);
2945 *buffers_to_free = head;
2946 __clear_page_buffers(page);
2947 return 1;
2948 failed:
2949 return 0;
2952 int try_to_free_buffers(struct page *page)
2954 struct address_space * const mapping = page->mapping;
2955 struct buffer_head *buffers_to_free = NULL;
2956 int ret = 0;
2958 BUG_ON(!PageLocked(page));
2959 if (PageWriteback(page))
2960 return 0;
2962 if (mapping == NULL) { /* can this still happen? */
2963 ret = drop_buffers(page, &buffers_to_free);
2964 goto out;
2967 spin_lock(&mapping->private_lock);
2968 ret = drop_buffers(page, &buffers_to_free);
2969 if (ret) {
2971 * If the filesystem writes its buffers by hand (eg ext3)
2972 * then we can have clean buffers against a dirty page. We
2973 * clean the page here; otherwise later reattachment of buffers
2974 * could encounter a non-uptodate page, which is unresolvable.
2975 * This only applies in the rare case where try_to_free_buffers
2976 * succeeds but the page is not freed.
2978 clear_page_dirty(page);
2980 spin_unlock(&mapping->private_lock);
2981 out:
2982 if (buffers_to_free) {
2983 struct buffer_head *bh = buffers_to_free;
2985 do {
2986 struct buffer_head *next = bh->b_this_page;
2987 free_buffer_head(bh);
2988 bh = next;
2989 } while (bh != buffers_to_free);
2990 }
2991 return ret;
2992 }
2993 EXPORT_SYMBOL(try_to_free_buffers);
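/*
 * [Editorial illustration -- not part of the original source.]  A hedged
 * sketch of a ->releasepage() method: the VM calls it with the page locked,
 * which provides the exclusion described above, and the filesystem defers
 * to try_to_free_buffers() unless it has its own reason to keep the
 * buffers.  example_buffers_pinned() is a hypothetical fs-specific check.
 */
extern int example_buffers_pinned(struct page *page);

static int example_releasepage(struct page *page, int gfp_mask)
{
        if (example_buffers_pinned(page))
                return 0;
        return try_to_free_buffers(page);
}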
2995 int block_sync_page(struct page *page)
2997 struct address_space *mapping;
2999 smp_mb();
3000 mapping = page_mapping(page);
3001 if (mapping)
3002 blk_run_backing_dev(mapping->backing_dev_info, page);
3003 return 0;
3007 * There are no bdflush tunables left. But distributions are
3008 * still running obsolete flush daemons, so we terminate them here.
3010 * Use of bdflush() is deprecated and will be removed in a future kernel.
3011 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3013 asmlinkage long sys_bdflush(int func, long data)
3015 static int msg_count;
3017 if (!capable(CAP_SYS_ADMIN))
3018 return -EPERM;
3020 if (msg_count < 5) {
3021 msg_count++;
3022 printk(KERN_INFO
3023 "warning: process `%s' used the obsolete bdflush"
3024 " system call\n", current->comm);
3025 printk(KERN_INFO "Fix your initscripts?\n");
3028 if (func == 1)
3029 do_exit(0);
3030 return 0;
3034 * Buffer-head allocation
3036 static kmem_cache_t *bh_cachep;
3039 * Once the number of bh's in the machine exceeds this level, we start
3040 * stripping them in writeback.
3042 static int max_buffer_heads;
3044 int buffer_heads_over_limit;
3046 struct bh_accounting {
3047 int nr; /* Number of live bh's */
3048 int ratelimit; /* Limit cacheline bouncing */
3051 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3053 static void recalc_bh_state(void)
3055 int i;
3056 int tot = 0;
3058 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3059 return;
3060 __get_cpu_var(bh_accounting).ratelimit = 0;
3061 for_each_cpu(i)
3062 tot += per_cpu(bh_accounting, i).nr;
3063 buffer_heads_over_limit = (tot > max_buffer_heads);
3066 struct buffer_head *alloc_buffer_head(int gfp_flags)
3068 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3069 if (ret) {
3070 preempt_disable();
3071 __get_cpu_var(bh_accounting).nr++;
3072 recalc_bh_state();
3073 preempt_enable();
3075 return ret;
3077 EXPORT_SYMBOL(alloc_buffer_head);
3079 void free_buffer_head(struct buffer_head *bh)
3081 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3082 kmem_cache_free(bh_cachep, bh);
3083 preempt_disable();
3084 __get_cpu_var(bh_accounting).nr--;
3085 recalc_bh_state();
3086 preempt_enable();
3088 EXPORT_SYMBOL(free_buffer_head);
3090 static void
3091 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3093 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3094 SLAB_CTOR_CONSTRUCTOR) {
3095 struct buffer_head * bh = (struct buffer_head *)data;
3097 memset(bh, 0, sizeof(*bh));
3098 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3102 #ifdef CONFIG_HOTPLUG_CPU
3103 static void buffer_exit_cpu(int cpu)
3105 int i;
3106 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3108 for (i = 0; i < BH_LRU_SIZE; i++) {
3109 brelse(b->bhs[i]);
3110 b->bhs[i] = NULL;
3114 static int buffer_cpu_notify(struct notifier_block *self,
3115 unsigned long action, void *hcpu)
3117 if (action == CPU_DEAD)
3118 buffer_exit_cpu((unsigned long)hcpu);
3119 return NOTIFY_OK;
3121 #endif /* CONFIG_HOTPLUG_CPU */
3123 void __init buffer_init(void)
3125 int i;
3126 int nrpages;
3128 bh_cachep = kmem_cache_create("buffer_head",
3129 sizeof(struct buffer_head), 0,
3130 SLAB_PANIC, init_buffer_head, NULL);
3131 for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
3132 init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
3135 * Limit the bh occupancy to 10% of ZONE_NORMAL
3137 nrpages = (nr_free_buffer_pages() * 10) / 100;
3138 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3139 hotcpu_notifier(buffer_cpu_notify, 0);
3142 #ifdef CONFIG_MAGIC_ROM_PTR
3143 /* should be an inline or a macro, but bd_disk's type is unknown where we use bromptr */
3144 int bromptr(struct block_device *bdev, struct vm_area_struct * vma)
3146 int (*romptr) (struct block_device *, struct vm_area_struct *) =
3147 bdev->bd_disk->fops->romptr;
3148 if (romptr)
3149 return romptr(bdev, vma);
3150 return -ENOSYS;
3152 #endif /* CONFIG_MAGIC_ROM_PTR */
3154 EXPORT_SYMBOL(__bforget);
3155 EXPORT_SYMBOL(__brelse);
3156 EXPORT_SYMBOL(__wait_on_buffer);
3157 EXPORT_SYMBOL(block_commit_write);
3158 EXPORT_SYMBOL(block_prepare_write);
3159 EXPORT_SYMBOL(block_read_full_page);
3160 EXPORT_SYMBOL(block_sync_page);
3161 EXPORT_SYMBOL(block_truncate_page);
3162 EXPORT_SYMBOL(block_write_full_page);
3163 EXPORT_SYMBOL(cont_prepare_write);
3164 EXPORT_SYMBOL(end_buffer_async_write);
3165 EXPORT_SYMBOL(end_buffer_read_sync);
3166 EXPORT_SYMBOL(end_buffer_write_sync);
3167 EXPORT_SYMBOL(file_fsync);
3168 EXPORT_SYMBOL(fsync_bdev);
3169 EXPORT_SYMBOL(generic_block_bmap);
3170 EXPORT_SYMBOL(generic_commit_write);
3171 EXPORT_SYMBOL(generic_cont_expand);
3172 EXPORT_SYMBOL(init_buffer);
3173 EXPORT_SYMBOL(invalidate_bdev);
3174 EXPORT_SYMBOL(ll_rw_block);
3175 EXPORT_SYMBOL(mark_buffer_dirty);
3176 EXPORT_SYMBOL(submit_bh);
3177 EXPORT_SYMBOL(sync_dirty_buffer);
3178 EXPORT_SYMBOL(unlock_buffer);