1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
7 /*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/config.h>
22 #include <linux/kernel.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/bio.h>
38 #include <linux/notifier.h>
39 #include <linux/cpu.h>
40 #include <asm/bitops.h>
42 static void invalidate_bh_lrus(void);
44 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
47 * Hashed waitqueue_head's for wait_on_buffer()
49 #define BH_WAIT_TABLE_ORDER 7
50 static struct bh_wait_queue_head {
51 wait_queue_head_t wqh;
52 } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
55 * Debug/devel support stuff
58 void __buffer_error(char *file, int line)
60 static int enough;
62 if (enough > 10)
63 return;
64 enough++;
65 printk("buffer layer error at %s:%d\n", file, line);
66 #ifndef CONFIG_KALLSYMS
67 printk("Pass this trace through ksymoops for reporting\n");
68 #endif
69 dump_stack();
71 EXPORT_SYMBOL(__buffer_error);
73 inline void
74 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
76 bh->b_end_io = handler;
77 bh->b_private = private;
81 * Return the address of the waitqueue_head to be used for this
82 * buffer_head
84 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
86 return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
88 EXPORT_SYMBOL(bh_waitq_head);
90 void wake_up_buffer(struct buffer_head *bh)
92 wait_queue_head_t *wq = bh_waitq_head(bh);
94 if (waitqueue_active(wq))
95 wake_up_all(wq);
97 EXPORT_SYMBOL(wake_up_buffer);
99 void unlock_buffer(struct buffer_head *bh)
102 * unlock_buffer against a zero-count bh is a bug, if the page
103 * is not locked. Because then nothing protects the buffer's
104 * waitqueue, which is used here. (Well. Other locked buffers
105 * against the page will pin it. But complain anyway).
107 if (atomic_read(&bh->b_count) == 0 &&
108 !PageLocked(bh->b_page) &&
109 !PageWriteback(bh->b_page))
110 buffer_error();
112 clear_buffer_locked(bh);
113 smp_mb__after_clear_bit();
114 wake_up_buffer(bh);
118 * Block until a buffer comes unlocked. This doesn't stop it
119 * from becoming locked again - you have to lock it yourself
120 * if you want to preserve its state.
122 void __wait_on_buffer(struct buffer_head * bh)
124 wait_queue_head_t *wqh = bh_waitq_head(bh);
125 DEFINE_WAIT(wait);
127 if (atomic_read(&bh->b_count) == 0 &&
128 (!bh->b_page || !PageLocked(bh->b_page)))
129 buffer_error();
131 do {
132 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
133 if (buffer_locked(bh)) {
134 blk_run_queues();
135 io_schedule();
137 } while (buffer_locked(bh));
138 finish_wait(wqh, &wait);
141 static void
142 __set_page_buffers(struct page *page, struct buffer_head *head)
144 if (page_has_buffers(page))
145 buffer_error();
146 page_cache_get(page);
147 SetPagePrivate(page);
148 page->private = (unsigned long)head;
151 static void
152 __clear_page_buffers(struct page *page)
154 ClearPagePrivate(page);
155 page->private = 0;
156 page_cache_release(page);
159 static void buffer_io_error(struct buffer_head *bh)
161 char b[BDEVNAME_SIZE];
163 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
164 bdevname(bh->b_bdev, b),
165 (unsigned long long)bh->b_blocknr);
169 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
170 * unlock the buffer. This is what ll_rw_block uses too.
172 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
174 if (uptodate) {
175 set_buffer_uptodate(bh);
176 } else {
178 * This happens, due to failed READA attempts.
179 * buffer_io_error(bh);
181 clear_buffer_uptodate(bh);
183 unlock_buffer(bh);
184 put_bh(bh);
188 * Write out and wait upon all the dirty data associated with a block
189 * device via its mapping. Does not take the superblock lock.
191 int sync_blockdev(struct block_device *bdev)
193 int ret = 0;
195 if (bdev) {
196 int err;
198 ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
199 err = filemap_fdatawait(bdev->bd_inode->i_mapping);
200 if (!ret)
201 ret = err;
203 return ret;
205 EXPORT_SYMBOL(sync_blockdev);
208 * Write out and wait upon all dirty data associated with this
209 * superblock. Filesystem data as well as the underlying block
210 * device. Takes the superblock lock.
212 int fsync_super(struct super_block *sb)
214 sync_inodes_sb(sb, 0);
215 DQUOT_SYNC(sb);
216 lock_super(sb);
217 if (sb->s_dirt && sb->s_op->write_super)
218 sb->s_op->write_super(sb);
219 unlock_super(sb);
220 if (sb->s_op->sync_fs)
221 sb->s_op->sync_fs(sb, 1);
222 sync_blockdev(sb->s_bdev);
223 sync_inodes_sb(sb, 1);
225 return sync_blockdev(sb->s_bdev);
229 * Write out and wait upon all dirty data associated with this
230 * device. Filesystem data as well as the underlying block
231 * device. Takes the superblock lock.
233 int fsync_bdev(struct block_device *bdev)
235 struct super_block *sb = get_super(bdev);
236 if (sb) {
237 int res = fsync_super(sb);
238 drop_super(sb);
239 return res;
241 return sync_blockdev(bdev);
245 * sync everything. Start out by waking pdflush, because that writes back
246 * all queues in parallel.
248 static void do_sync(unsigned long wait)
250 wakeup_bdflush(0);
251 sync_inodes(0); /* All mappings, inodes and their blockdevs */
252 DQUOT_SYNC(NULL);
253 sync_supers(); /* Write the superblocks */
254 sync_filesystems(0); /* Start syncing the filesystems */
255 sync_filesystems(wait); /* Waitingly sync the filesystems */
256 sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
257 if (!wait)
258 printk("Emergency Sync complete\n");
261 asmlinkage long sys_sync(void)
263 do_sync(1);
264 return 0;
267 void emergency_sync(void)
269 pdflush_operation(do_sync, 0);
273 * Generic function to fsync a file.
275 * filp may be NULL if called via the msync of a vma.
278 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
280 struct inode * inode = dentry->d_inode;
281 struct super_block * sb;
282 int ret;
284 /* sync the inode to buffers */
285 write_inode_now(inode, 0);
287 /* sync the superblock to buffers */
288 sb = inode->i_sb;
289 lock_super(sb);
290 if (sb->s_op->write_super)
291 sb->s_op->write_super(sb);
292 unlock_super(sb);
294 /* .. finally sync the buffers to disk */
295 ret = sync_blockdev(sb->s_bdev);
296 return ret;
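/*
 * Example (illustrative sketch; "foofs" is a hypothetical filesystem): a
 * filesystem whose metadata all lives in the buffer/page cache can simply
 * point its ->fsync method at file_fsync():
 */
static struct file_operations foofs_file_operations = {
	.read	= generic_file_read,
	.write	= generic_file_write,
	.mmap	= generic_file_mmap,
	.fsync	= file_fsync,
};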
299 asmlinkage long sys_fsync(unsigned int fd)
301 struct file * file;
302 struct dentry * dentry;
303 struct inode * inode;
304 int ret, err;
306 ret = -EBADF;
307 file = fget(fd);
308 if (!file)
309 goto out;
311 dentry = file->f_dentry;
312 inode = dentry->d_inode;
314 ret = -EINVAL;
315 if (!file->f_op || !file->f_op->fsync) {
316 /* Why? We can still call filemap_fdatawrite */
317 goto out_putf;
320 /* We need to protect against concurrent writers.. */
321 down(&inode->i_sem);
322 current->flags |= PF_SYNCWRITE;
323 ret = filemap_fdatawrite(inode->i_mapping);
324 err = file->f_op->fsync(file, dentry, 0);
325 if (!ret)
326 ret = err;
327 err = filemap_fdatawait(inode->i_mapping);
328 if (!ret)
329 ret = err;
330 current->flags &= ~PF_SYNCWRITE;
331 up(&inode->i_sem);
333 out_putf:
334 fput(file);
335 out:
336 return ret;
339 asmlinkage long sys_fdatasync(unsigned int fd)
341 struct file * file;
342 struct dentry * dentry;
343 struct inode * inode;
344 int ret, err;
346 ret = -EBADF;
347 file = fget(fd);
348 if (!file)
349 goto out;
351 dentry = file->f_dentry;
352 inode = dentry->d_inode;
354 ret = -EINVAL;
355 if (!file->f_op || !file->f_op->fsync)
356 goto out_putf;
358 down(&inode->i_sem);
359 current->flags |= PF_SYNCWRITE;
360 ret = filemap_fdatawrite(inode->i_mapping);
361 err = file->f_op->fsync(file, dentry, 1);
362 if (!ret)
363 ret = err;
364 err = filemap_fdatawait(inode->i_mapping);
365 if (!ret)
366 ret = err;
367 current->flags &= ~PF_SYNCWRITE;
368 up(&inode->i_sem);
370 out_putf:
371 fput(file);
372 out:
373 return ret;
377 * Various filesystems appear to want __find_get_block to be non-blocking.
378 * But it's the page lock which protects the buffers. To get around this,
379 * we get exclusion from try_to_free_buffers with the blockdev mapping's
380 * private_lock.
382 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
383 * may be quite high. This code could TryLock the page, and if that
384 * succeeds, there is no need to take private_lock. (But if
385 * private_lock is contended then so is mapping->page_lock).
387 static struct buffer_head *
388 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
390 struct inode *bd_inode = bdev->bd_inode;
391 struct address_space *bd_mapping = bd_inode->i_mapping;
392 struct buffer_head *ret = NULL;
393 unsigned long index;
394 struct buffer_head *bh;
395 struct buffer_head *head;
396 struct page *page;
398 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
399 page = find_get_page(bd_mapping, index);
400 if (!page)
401 goto out;
403 spin_lock(&bd_mapping->private_lock);
404 if (!page_has_buffers(page))
405 goto out_unlock;
406 head = page_buffers(page);
407 bh = head;
408 do {
409 if (bh->b_blocknr == block) {
410 ret = bh;
411 get_bh(bh);
412 goto out_unlock;
414 bh = bh->b_this_page;
415 } while (bh != head);
416 buffer_error();
417 out_unlock:
418 spin_unlock(&bd_mapping->private_lock);
419 page_cache_release(page);
420 out:
421 return ret;
424 /* If invalidate_buffers() trashes dirty buffers, it means some kind
425 of fs corruption is going on. Trashing dirty data always implies losing
426 information that was supposed to have been stored on the physical layer
427 by the user.
429 Thus invalidate_buffers, in general usage, is not allowed to trash
430 dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to
431 be preserved. Such buffers are simply skipped.
433 We also skip buffers which are still in use. For example, this can
434 happen if a userspace program is reading the block device.
436 NOTE: if the user removes a removable-media disk while it still holds
437 dirty data that was never synced to disk (due to a bug in the device driver
438 or to a user error), then by not destroying those dirty buffers we could
439 also corrupt the next media inserted. A parameter is therefore
440 necessary to handle this case in the safest way possible (trying
441 not to corrupt the newly inserted disk with data belonging to
442 the old, now-corrupted one). For a ramdisk, on the other hand, the natural
443 way to release its memory is precisely to destroy the dirty buffers.
445 Those are the two special cases. Normal usage is for the device driver
446 to issue a sync on the device (without waiting for I/O completion) and
447 then an invalidate_buffers call that does not trash dirty buffers.
449 The 'update' case has been introduced to handle cache coherency with the
450 blkdev pagecache. It is needed to re-read any pinned buffer from disk.
451 NOTE: re-reading from disk is destructive, so we can do it only
452 when we assume nobody is changing the buffercache under our I/O and when
453 we think the disk contains more recent information than the buffercache.
454 The update == 1 pass marks the buffers that need updating; the update == 2
455 pass does the actual I/O. */
456 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
458 invalidate_bh_lrus();
460 * FIXME: what about destroy_dirty_buffers?
461 * We really want to use invalidate_inode_pages2() for
462 * that, but not until that's cleaned up.
464 invalidate_inode_pages(bdev->bd_inode->i_mapping);
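/*
 * Example (illustrative sketch; foo_media_changed() is hypothetical): the
 * "normal usage" described in the comment above - sync what we can, then
 * drop only clean, unpinned buffers and pages:
 */
static void foo_media_changed(struct block_device *bdev)
{
	sync_blockdev(bdev);		/* start and wait upon writeback */
	invalidate_bdev(bdev, 0);	/* 0: do not destroy dirty buffers */
}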
468 * Kick pdflush then try to free up some ZONE_NORMAL memory.
470 static void free_more_memory(void)
472 struct zone *zone;
473 pg_data_t *pgdat;
475 wakeup_bdflush(1024);
476 blk_run_queues();
477 yield();
479 for_each_pgdat(pgdat) {
480 zone = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones[0];
481 if (zone)
482 try_to_free_pages(zone, GFP_NOFS, 0);
487 * I/O completion handler for block_read_full_page() - pages
488 * which come unlocked at the end of I/O.
490 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
492 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
493 unsigned long flags;
494 struct buffer_head *tmp;
495 struct page *page;
496 int page_uptodate = 1;
498 BUG_ON(!buffer_async_read(bh));
500 page = bh->b_page;
501 if (uptodate) {
502 set_buffer_uptodate(bh);
503 } else {
504 clear_buffer_uptodate(bh);
505 buffer_io_error(bh);
506 SetPageError(page);
510 * Be _very_ careful from here on. Bad things can happen if
511 * two buffer heads end IO at almost the same time and both
512 * decide that the page is now completely done.
514 spin_lock_irqsave(&page_uptodate_lock, flags);
515 clear_buffer_async_read(bh);
516 unlock_buffer(bh);
517 tmp = bh;
518 do {
519 if (!buffer_uptodate(tmp))
520 page_uptodate = 0;
521 if (buffer_async_read(tmp)) {
522 BUG_ON(!buffer_locked(tmp));
523 goto still_busy;
525 tmp = tmp->b_this_page;
526 } while (tmp != bh);
527 spin_unlock_irqrestore(&page_uptodate_lock, flags);
530 * If none of the buffers had errors and they are all
531 * uptodate then we can set the page uptodate.
533 if (page_uptodate && !PageError(page))
534 SetPageUptodate(page);
535 unlock_page(page);
536 return;
538 still_busy:
539 spin_unlock_irqrestore(&page_uptodate_lock, flags);
540 return;
544 * Completion handler for block_write_full_page() - pages which are unlocked
545 * during I/O, and which have PageWriteback cleared upon I/O completion.
547 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
549 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
550 unsigned long flags;
551 struct buffer_head *tmp;
552 struct page *page;
554 BUG_ON(!buffer_async_write(bh));
556 page = bh->b_page;
557 if (uptodate) {
558 set_buffer_uptodate(bh);
559 } else {
560 buffer_io_error(bh);
561 clear_buffer_uptodate(bh);
562 SetPageError(page);
565 spin_lock_irqsave(&page_uptodate_lock, flags);
566 clear_buffer_async_write(bh);
567 unlock_buffer(bh);
568 tmp = bh->b_this_page;
569 while (tmp != bh) {
570 if (buffer_async_write(tmp)) {
571 BUG_ON(!buffer_locked(tmp));
572 goto still_busy;
574 tmp = tmp->b_this_page;
576 spin_unlock_irqrestore(&page_uptodate_lock, flags);
577 end_page_writeback(page);
578 return;
580 still_busy:
581 spin_unlock_irqrestore(&page_uptodate_lock, flags);
582 return;
586 * If a page's buffers are under async readin (end_buffer_async_read
587 * completion) then there is a possibility that another thread of
588 * control could lock one of the buffers after it has completed
589 * but while some of the other buffers have not completed. This
590 * locked buffer would confuse end_buffer_async_read() into not unlocking
591 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
592 * that this buffer is not under async I/O.
594 * The page comes unlocked when it has no locked buffer_async buffers
595 * left.
597 * PageLocked prevents anyone starting new async I/O reads any of
598 * the buffers.
600 * PageWriteback is used to prevent simultaneous writeout of the same
601 * page.
603 * PageLocked prevents anyone from starting writeback of a page which is
604 * under read I/O (PageWriteback is only ever set against a locked page).
606 void mark_buffer_async_read(struct buffer_head *bh)
608 bh->b_end_io = end_buffer_async_read;
609 set_buffer_async_read(bh);
611 EXPORT_SYMBOL(mark_buffer_async_read);
613 void mark_buffer_async_write(struct buffer_head *bh)
615 bh->b_end_io = end_buffer_async_write;
616 set_buffer_async_write(bh);
618 EXPORT_SYMBOL(mark_buffer_async_write);
622 * fs/buffer.c contains helper functions for buffer-backed address space's
623 * fsync functions. A common requirement for buffer-based filesystems is
624 * that certain data from the backing blockdev needs to be written out for
625 * a successful fsync(). For example, ext2 indirect blocks need to be
626 * written back and waited upon before fsync() returns.
628 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
629 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
630 * management of a list of dependent buffers at ->i_mapping->private_list.
632 * Locking is a little subtle: try_to_free_buffers() will remove buffers
633 * from their controlling inode's queue when they are being freed. But
634 * try_to_free_buffers() will be operating against the *blockdev* mapping
635 * at the time, not against the S_ISREG file which depends on those buffers.
636 * So the locking for private_list is via the private_lock in the address_space
637 * which backs the buffers. Which is different from the address_space
638 * against which the buffers are listed. So for a particular address_space,
639 * mapping->private_lock does *not* protect mapping->private_list! In fact,
640 * mapping->private_list will always be protected by the backing blockdev's
641 * ->private_lock.
643 * Which introduces a requirement: all buffers on an address_space's
644 * ->private_list must be from the same address_space: the blockdev's.
646 * address_spaces which do not place buffers at ->private_list via these
647 * utility functions are free to use private_lock and private_list for
648 * whatever they want. The only requirement is that list_empty(private_list)
649 * be true at clear_inode() time.
651 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
652 * filesystems should do that. invalidate_inode_buffers() should just go
653 * BUG_ON(!list_empty).
655 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
656 * take an address_space, not an inode. And it should be called
657 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
658 * queued up.
660 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
661 * list if it is already on a list. Because if the buffer is on a list,
662 * it *must* already be on the right one. If not, the filesystem is being
663 * silly. This will save a ton of locking. But first we have to ensure
664 * that buffers are taken *off* the old inode's list when they are freed
665 * (presumably in truncate). That requires careful auditing of all
666 * filesystems (do it inside bforget()). It could also be done by bringing
667 * b_inode back.
670 void buffer_insert_list(spinlock_t *lock,
671 struct buffer_head *bh, struct list_head *list)
673 spin_lock(lock);
674 list_move_tail(&bh->b_assoc_buffers, list);
675 spin_unlock(lock);
679 * The buffer's backing address_space's private_lock must be held
681 static inline void __remove_assoc_queue(struct buffer_head *bh)
683 list_del_init(&bh->b_assoc_buffers);
686 int inode_has_buffers(struct inode *inode)
688 return !list_empty(&inode->i_data.private_list);
692 * osync is designed to support O_SYNC io. It waits synchronously for
693 * all already-submitted IO to complete, but does not queue any new
694 * writes to the disk.
696 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
697 * you dirty the buffers, and then use osync_inode_buffers to wait for
698 * completion. Any other dirty buffers which are not yet queued for
699 * write will not be flushed to disk by the osync.
701 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
703 struct buffer_head *bh;
704 struct list_head *p;
705 int err = 0;
707 spin_lock(lock);
708 repeat:
709 list_for_each_prev(p, list) {
710 bh = BH_ENTRY(p);
711 if (buffer_locked(bh)) {
712 get_bh(bh);
713 spin_unlock(lock);
714 wait_on_buffer(bh);
715 if (!buffer_uptodate(bh))
716 err = -EIO;
717 brelse(bh);
718 spin_lock(lock);
719 goto repeat;
722 spin_unlock(lock);
723 return err;
727 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
728 * buffers
729 * @buffer_mapping - the mapping which backs the buffers' data
730 * @mapping - the mapping which wants those buffers written
732 * Starts I/O against the buffers at mapping->private_list, and waits upon
733 * that I/O.
735 * Basically, this is a convenience function for fsync(). @buffer_mapping is
736 * the blockdev which "owns" the buffers and @mapping is a file or directory
737 * which needs those buffers to be written for a successful fsync().
739 int sync_mapping_buffers(struct address_space *mapping)
741 struct address_space *buffer_mapping = mapping->assoc_mapping;
743 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
744 return 0;
746 return fsync_buffers_list(&buffer_mapping->private_lock,
747 &mapping->private_list);
749 EXPORT_SYMBOL(sync_mapping_buffers);
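/*
 * Example (illustrative sketch; the foofs_* names are hypothetical): a
 * typical ->fsync implementation uses sync_mapping_buffers() to flush the
 * metadata buffers which mark_buffer_dirty_inode() queued on the inode's
 * ->private_list; the data pages themselves are written by the caller
 * (see sys_fsync() above).
 */
static int foofs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	int err;

	/* write out and wait upon the associated metadata buffers */
	err = sync_mapping_buffers(inode->i_mapping);

	/* push the inode's own on-disk structure to its backing buffers */
	write_inode_now(inode, 0);
	return err;
}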
752 * Called when we've recently written block `bblock', and it is known that
753 * `bblock' was for a buffer_boundary() buffer. This means that the block at
754 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
755 * dirty, schedule it for IO. So that indirects merge nicely with their data.
757 void write_boundary_block(struct block_device *bdev,
758 sector_t bblock, unsigned blocksize)
760 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
761 if (bh) {
762 if (buffer_dirty(bh))
763 ll_rw_block(WRITE, 1, &bh);
764 put_bh(bh);
768 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
770 struct address_space *mapping = inode->i_mapping;
771 struct address_space *buffer_mapping = bh->b_page->mapping;
773 mark_buffer_dirty(bh);
774 if (!mapping->assoc_mapping) {
775 mapping->assoc_mapping = buffer_mapping;
776 } else {
777 if (mapping->assoc_mapping != buffer_mapping)
778 BUG();
780 if (list_empty(&bh->b_assoc_buffers))
781 buffer_insert_list(&buffer_mapping->private_lock,
782 bh, &mapping->private_list);
784 EXPORT_SYMBOL(mark_buffer_dirty_inode);
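/*
 * Example (illustrative sketch; the foofs_* names are hypothetical): when a
 * filesystem updates a piece of blockdev metadata on which a later fsync()
 * of a regular file depends (an ext2-style indirect block, say), it dirties
 * the buffer against that file's inode so sync_mapping_buffers() finds it:
 */
static void foofs_set_indirect_entry(struct inode *inode,
		struct buffer_head *bh, int slot, u32 blocknr)
{
	((u32 *)bh->b_data)[slot] = cpu_to_le32(blocknr);
	/* dirty bh and queue it on inode->i_mapping->private_list */
	mark_buffer_dirty_inode(bh, inode);
}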
787 * Add a page to the dirty page list.
789 * It is a sad fact of life that this function is called from several places
790 * deeply under spinlocking. It may not sleep.
792 * If the page has buffers, the uptodate buffers are set dirty, to preserve
793 * dirty-state coherency between the page and the buffers. If the page does
794 * not have buffers then when they are later attached they will all be set
795 * dirty.
797 * The buffers are dirtied before the page is dirtied. There's a small race
798 * window in which a writepage caller may see the page cleanness but not the
799 * buffer dirtiness. That's fine. If this code were to set the page dirty
800 * before the buffers, a concurrent writepage caller could clear the page dirty
801 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
802 * page on the dirty page list.
804 * There is also a small window where the page is dirty, and not on dirty_pages.
805 * Also a possibility that by the time the page is added to dirty_pages, it has
806 * been set clean. The page lists are somewhat approximate in this regard.
807 * It's better to have clean pages accidentally attached to dirty_pages than to
808 * leave dirty pages attached to clean_pages.
810 * We use private_lock to lock against try_to_free_buffers while using the
811 * page's buffer list. Also use this to protect against clean buffers being
812 * added to the page after it was set dirty.
814 * FIXME: may need to call ->reservepage here as well. That's rather up to the
815 * address_space though.
817 * For now, we treat swapper_space specially. It doesn't use the normal
818 * block a_ops.
820 int __set_page_dirty_buffers(struct page *page)
822 struct address_space * const mapping = page->mapping;
823 int ret = 0;
825 if (mapping == NULL) {
826 SetPageDirty(page);
827 goto out;
830 spin_lock(&mapping->private_lock);
831 if (page_has_buffers(page)) {
832 struct buffer_head *head = page_buffers(page);
833 struct buffer_head *bh = head;
835 do {
836 if (buffer_uptodate(bh))
837 set_buffer_dirty(bh);
838 else
839 buffer_error();
840 bh = bh->b_this_page;
841 } while (bh != head);
843 spin_unlock(&mapping->private_lock);
845 if (!TestSetPageDirty(page)) {
846 spin_lock(&mapping->page_lock);
847 if (page->mapping) { /* Race with truncate? */
848 if (!mapping->backing_dev_info->memory_backed)
849 inc_page_state(nr_dirty);
850 list_del(&page->list);
851 list_add(&page->list, &mapping->dirty_pages);
853 spin_unlock(&mapping->page_lock);
854 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
857 out:
858 return ret;
860 EXPORT_SYMBOL(__set_page_dirty_buffers);
863 * Write out and wait upon a list of buffers.
865 * We have conflicting pressures: we want to make sure that all
866 * initially dirty buffers get waited on, but that any subsequently
867 * dirtied buffers don't. After all, we don't want fsync to last
868 * forever if somebody is actively writing to the file.
870 * Do this in two main stages: first we copy dirty buffers to a
871 * temporary inode list, queueing the writes as we go. Then we clean
872 * up, waiting for those writes to complete.
874 * During this second stage, any subsequent updates to the file may end
875 * up refiling the buffer on the original inode's dirty list again, so
876 * there is a chance we will end up with a buffer queued for write but
877 * not yet completed on that list. So, as a final cleanup we go through
878 * the osync code to catch these locked, dirty buffers without requeuing
879 * any newly dirty buffers for write.
881 int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
883 struct buffer_head *bh;
884 struct list_head tmp;
885 int err = 0, err2;
887 INIT_LIST_HEAD(&tmp);
889 spin_lock(lock);
890 while (!list_empty(list)) {
891 bh = BH_ENTRY(list->next);
892 list_del_init(&bh->b_assoc_buffers);
893 if (buffer_dirty(bh) || buffer_locked(bh)) {
894 list_add(&bh->b_assoc_buffers, &tmp);
895 if (buffer_dirty(bh)) {
896 get_bh(bh);
897 spin_unlock(lock);
899 * Ensure any pending I/O completes so that
900 * ll_rw_block() actually writes the current
901 * contents - it is a noop if I/O is still in
902 * flight on potentially older contents.
904 wait_on_buffer(bh);
905 ll_rw_block(WRITE, 1, &bh);
906 brelse(bh);
907 spin_lock(lock);
912 while (!list_empty(&tmp)) {
913 bh = BH_ENTRY(tmp.prev);
914 __remove_assoc_queue(bh);
915 get_bh(bh);
916 spin_unlock(lock);
917 wait_on_buffer(bh);
918 if (!buffer_uptodate(bh))
919 err = -EIO;
920 brelse(bh);
921 spin_lock(lock);
924 spin_unlock(lock);
925 err2 = osync_buffers_list(lock, list);
926 if (err)
927 return err;
928 else
929 return err2;
933 * Invalidate any and all dirty buffers on a given inode. We are
934 * probably unmounting the fs, but that doesn't mean we have already
935 * done a sync(). Just drop the buffers from the inode list.
937 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
938 * assumes that all the buffers are against the blockdev. Not true
939 * for reiserfs.
941 void invalidate_inode_buffers(struct inode *inode)
943 if (inode_has_buffers(inode)) {
944 struct address_space *mapping = &inode->i_data;
945 struct list_head *list = &mapping->private_list;
946 struct address_space *buffer_mapping = mapping->assoc_mapping;
948 spin_lock(&buffer_mapping->private_lock);
949 while (!list_empty(list))
950 __remove_assoc_queue(BH_ENTRY(list->next));
951 spin_unlock(&buffer_mapping->private_lock);
956 * Remove any clean buffers from the inode's buffer list. This is called
957 * when we're trying to free the inode itself. Those buffers can pin it.
959 * Returns true if all buffers were removed.
961 int remove_inode_buffers(struct inode *inode)
963 int ret = 1;
965 if (inode_has_buffers(inode)) {
966 struct address_space *mapping = &inode->i_data;
967 struct list_head *list = &mapping->private_list;
968 struct address_space *buffer_mapping = mapping->assoc_mapping;
970 spin_lock(&buffer_mapping->private_lock);
971 while (!list_empty(list)) {
972 struct buffer_head *bh = BH_ENTRY(list->next);
973 if (buffer_dirty(bh)) {
974 ret = 0;
975 break;
977 __remove_assoc_queue(bh);
979 spin_unlock(&buffer_mapping->private_lock);
981 return ret;
985 * Create the appropriate buffers when given a page for data area and
986 * the size of each buffer.. Use the bh->b_this_page linked list to
987 * follow the buffers created. Return NULL if unable to create more
988 * buffers.
990 * The retry flag is used to differentiate async IO (paging, swapping)
991 * which may not fail from ordinary buffer allocations.
993 static struct buffer_head *
994 create_buffers(struct page * page, unsigned long size, int retry)
996 struct buffer_head *bh, *head;
997 long offset;
999 try_again:
1000 head = NULL;
1001 offset = PAGE_SIZE;
1002 while ((offset -= size) >= 0) {
1003 bh = alloc_buffer_head(GFP_NOFS);
1004 if (!bh)
1005 goto no_grow;
1007 bh->b_bdev = NULL;
1008 bh->b_this_page = head;
1009 bh->b_blocknr = -1;
1010 head = bh;
1012 bh->b_state = 0;
1013 atomic_set(&bh->b_count, 0);
1014 bh->b_size = size;
1016 /* Link the buffer to its page */
1017 set_bh_page(bh, page, offset);
1019 bh->b_end_io = NULL;
1021 return head;
1023 * In case anything failed, we just free everything we got.
1025 no_grow:
1026 if (head) {
1027 do {
1028 bh = head;
1029 head = head->b_this_page;
1030 free_buffer_head(bh);
1031 } while (head);
1035 * Return failure for non-async IO requests. Async IO requests
1036 * are not allowed to fail, so we have to wait until buffer heads
1037 * become available. But we don't want tasks sleeping with
1038 * partially complete buffers, so all were released above.
1040 if (!retry)
1041 return NULL;
1043 /* We're _really_ low on memory. Now we just
1044 * wait for old buffer heads to become free due to
1045 * finishing IO. Since this is an async request and
1046 * the reserve list is empty, we're sure there are
1047 * async buffer heads in use.
1049 free_more_memory();
1050 goto try_again;
1053 static inline void
1054 link_dev_buffers(struct page *page, struct buffer_head *head)
1056 struct buffer_head *bh, *tail;
1058 bh = head;
1059 do {
1060 tail = bh;
1061 bh = bh->b_this_page;
1062 } while (bh);
1063 tail->b_this_page = head;
1064 __set_page_buffers(page, head);
1068 * Initialise the state of a blockdev page's buffers.
1070 static void
1071 init_page_buffers(struct page *page, struct block_device *bdev,
1072 int block, int size)
1074 struct buffer_head *head = page_buffers(page);
1075 struct buffer_head *bh = head;
1076 unsigned int b_state;
1078 b_state = 1 << BH_Mapped;
1079 if (PageUptodate(page))
1080 b_state |= 1 << BH_Uptodate;
1082 do {
1083 if (!(bh->b_state & (1 << BH_Mapped))) {
1084 init_buffer(bh, NULL, NULL);
1085 bh->b_bdev = bdev;
1086 bh->b_blocknr = block;
1087 bh->b_state = b_state;
1089 block++;
1090 bh = bh->b_this_page;
1091 } while (bh != head);
1095 * Create the page-cache page that contains the requested block.
1097 * This is used purely for blockdev mappings.
1099 static struct page *
1100 grow_dev_page(struct block_device *bdev, unsigned long block,
1101 unsigned long index, int size)
1103 struct inode *inode = bdev->bd_inode;
1104 struct page *page;
1105 struct buffer_head *bh;
1107 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1108 if (!page)
1109 return NULL;
1111 if (!PageLocked(page))
1112 BUG();
1114 if (page_has_buffers(page)) {
1115 bh = page_buffers(page);
1116 if (bh->b_size == size)
1117 return page;
1118 if (!try_to_free_buffers(page))
1119 goto failed;
1123 * Allocate some buffers for this page
1125 bh = create_buffers(page, size, 0);
1126 if (!bh)
1127 goto failed;
1130 * Link the page to the buffers and initialise them. Take the
1131 * lock to be atomic wrt __find_get_block(), which does not
1132 * run under the page lock.
1134 spin_lock(&inode->i_mapping->private_lock);
1135 link_dev_buffers(page, bh);
1136 init_page_buffers(page, bdev, block, size);
1137 spin_unlock(&inode->i_mapping->private_lock);
1138 return page;
1140 failed:
1141 buffer_error();
1142 unlock_page(page);
1143 page_cache_release(page);
1144 return NULL;
1148 * Create buffers for the specified block device block's page. If
1149 * that page was dirty, the buffers are set dirty also.
1151 * Except that's a bug. Attaching dirty buffers to a dirty
1152 * blockdev's page can result in filesystem corruption, because
1153 * some of those buffers may be aliases of filesystem data.
1154 * grow_dev_page() will go BUG() if this happens.
1156 static inline int
1157 grow_buffers(struct block_device *bdev, unsigned long block, int size)
1159 struct page *page;
1160 unsigned long index;
1161 int sizebits;
1163 /* Size must be multiple of hard sectorsize */
1164 if (size & (bdev_hardsect_size(bdev)-1))
1165 BUG();
1166 if (size < 512 || size > PAGE_SIZE)
1167 BUG();
1169 sizebits = -1;
1170 do {
1171 sizebits++;
1172 } while ((size << sizebits) < PAGE_SIZE);
1174 index = block >> sizebits;
1175 block = index << sizebits;
1177 /* Create a page with the proper size buffers.. */
1178 page = grow_dev_page(bdev, block, index, size);
1179 if (!page)
1180 return 0;
1181 unlock_page(page);
1182 page_cache_release(page);
1183 return 1;
1186 struct buffer_head *
1187 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1189 for (;;) {
1190 struct buffer_head * bh;
1192 bh = __find_get_block(bdev, block, size);
1193 if (bh)
1194 return bh;
1196 if (!grow_buffers(bdev, block, size))
1197 free_more_memory();
1202 * The relationship between dirty buffers and dirty pages:
1204 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1205 * the page appears on its address_space.dirty_pages list.
1207 * At all times, the dirtiness of the buffers represents the dirtiness of
1208 * subsections of the page. If the page has buffers, the page dirty bit is
1209 * merely a hint about the true dirty state.
1211 * When a page is set dirty in its entirety, all its buffers are marked dirty
1212 * (if the page has buffers).
1214 * When a buffer is marked dirty, its page is dirtied, but the page's other
1215 * buffers are not.
1217 * Also. When blockdev buffers are explicitly read with bread(), they
1218 * individually become uptodate. But their backing page remains not
1219 * uptodate - even if all of its buffers are uptodate. A subsequent
1220 * block_read_full_page() against that page will discover all the uptodate
1221 * buffers, will set the page uptodate and will perform no I/O.
1225 * mark_buffer_dirty - mark a buffer_head as needing writeout
1227 * mark_buffer_dirty() will set the dirty bit against the buffer,
1228 * then set its backing page dirty, then attach the page to its
1229 * address_space's dirty_pages list and then attach the address_space's
1230 * inode to its superblock's dirty inode list.
1232 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1233 * mapping->page_lock and the global inode_lock.
1235 void mark_buffer_dirty(struct buffer_head *bh)
1237 if (!buffer_uptodate(bh))
1238 buffer_error();
1239 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1240 __set_page_dirty_nobuffers(bh->b_page);
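/*
 * Example (illustrative sketch; foo_set_byte() and its callers are
 * hypothetical): the usual read-modify-write pattern through the buffer
 * cache.  sb_bread() is the superblock-blocksize wrapper around __bread()
 * below:
 */
static int foo_set_byte(struct super_block *sb, sector_t block,
		unsigned offset, unsigned char val)
{
	struct buffer_head *bh = sb_bread(sb, block);

	if (!bh)
		return -EIO;
	bh->b_data[offset] = val;
	mark_buffer_dirty(bh);	/* dirties bh, its page and the inode */
	brelse(bh);
	return 0;
}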
1244 * Decrement a buffer_head's reference count. If all buffers against a page
1245 * have zero reference count, are clean and unlocked, and if the page is clean
1246 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1247 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1248 * a page but it ends up not being freed, and buffers may later be reattached).
1250 void __brelse(struct buffer_head * buf)
1252 if (atomic_read(&buf->b_count)) {
1253 put_bh(buf);
1254 return;
1256 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1257 buffer_error(); /* For the stack backtrace */
1261 * bforget() is like brelse(), except it discards any
1262 * potentially dirty data.
1264 void __bforget(struct buffer_head *bh)
1266 clear_buffer_dirty(bh);
1267 if (!list_empty(&bh->b_assoc_buffers)) {
1268 struct address_space *buffer_mapping = bh->b_page->mapping;
1270 spin_lock(&buffer_mapping->private_lock);
1271 list_del_init(&bh->b_assoc_buffers);
1272 spin_unlock(&buffer_mapping->private_lock);
1274 __brelse(bh);
1277 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1279 lock_buffer(bh);
1280 if (buffer_uptodate(bh)) {
1281 unlock_buffer(bh);
1282 return bh;
1283 } else {
1284 if (buffer_dirty(bh))
1285 buffer_error();
1286 get_bh(bh);
1287 bh->b_end_io = end_buffer_io_sync;
1288 submit_bh(READ, bh);
1289 wait_on_buffer(bh);
1290 if (buffer_uptodate(bh))
1291 return bh;
1293 brelse(bh);
1294 return NULL;
1298 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1299 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1300 * refcount elevated by one when they're in an LRU. A buffer can only appear
1301 * once in a particular CPU's LRU. A single buffer can be present in multiple
1302 * CPU's LRUs at the same time.
1304 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1305 * sb_find_get_block().
1307 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1308 * a local interrupt disable for that.
1311 #define BH_LRU_SIZE 8
1313 struct bh_lru {
1314 struct buffer_head *bhs[BH_LRU_SIZE];
1317 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{0}};
1319 #ifdef CONFIG_SMP
1320 #define bh_lru_lock() local_irq_disable()
1321 #define bh_lru_unlock() local_irq_enable()
1322 #else
1323 #define bh_lru_lock() preempt_disable()
1324 #define bh_lru_unlock() preempt_enable()
1325 #endif
1327 static inline void check_irqs_on(void)
1329 #ifdef irqs_disabled
1330 BUG_ON(irqs_disabled());
1331 #endif
1335 * The LRU management algorithm is dopey-but-simple. Sorry.
1337 static void bh_lru_install(struct buffer_head *bh)
1339 struct buffer_head *evictee = NULL;
1340 struct bh_lru *lru;
1342 check_irqs_on();
1343 bh_lru_lock();
1344 lru = &__get_cpu_var(bh_lrus);
1345 if (lru->bhs[0] != bh) {
1346 struct buffer_head *bhs[BH_LRU_SIZE];
1347 int in;
1348 int out = 0;
1350 get_bh(bh);
1351 bhs[out++] = bh;
1352 for (in = 0; in < BH_LRU_SIZE; in++) {
1353 struct buffer_head *bh2 = lru->bhs[in];
1355 if (bh2 == bh) {
1356 __brelse(bh2);
1357 } else {
1358 if (out >= BH_LRU_SIZE) {
1359 BUG_ON(evictee != NULL);
1360 evictee = bh2;
1361 } else {
1362 bhs[out++] = bh2;
1366 while (out < BH_LRU_SIZE)
1367 bhs[out++] = NULL;
1368 memcpy(lru->bhs, bhs, sizeof(bhs));
1370 bh_lru_unlock();
1372 if (evictee)
1373 __brelse(evictee);
1377 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1379 static inline struct buffer_head *
1380 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1382 struct buffer_head *ret = NULL;
1383 struct bh_lru *lru;
1384 int i;
1386 check_irqs_on();
1387 bh_lru_lock();
1388 lru = &__get_cpu_var(bh_lrus);
1389 for (i = 0; i < BH_LRU_SIZE; i++) {
1390 struct buffer_head *bh = lru->bhs[i];
1392 if (bh && bh->b_bdev == bdev &&
1393 bh->b_blocknr == block && bh->b_size == size) {
1394 if (i) {
1395 while (i) {
1396 lru->bhs[i] = lru->bhs[i - 1];
1397 i--;
1399 lru->bhs[0] = bh;
1401 get_bh(bh);
1402 ret = bh;
1403 break;
1406 bh_lru_unlock();
1407 return ret;
1411 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1412 * it in the LRU and mark it as accessed. If it is not present then return
1413 * NULL
1415 struct buffer_head *
1416 __find_get_block(struct block_device *bdev, sector_t block, int size)
1418 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1420 if (bh == NULL) {
1421 bh = __find_get_block_slow(bdev, block, size);
1422 if (bh)
1423 bh_lru_install(bh);
1425 if (bh)
1426 touch_buffer(bh);
1427 return bh;
1429 EXPORT_SYMBOL(__find_get_block);
1432 * __getblk will locate (and, if necessary, create) the buffer_head
1433 * which corresponds to the passed block_device, block and size. The
1434 * returned buffer has its reference count incremented.
1436 * __getblk() cannot fail - it just keeps trying. If you pass it an
1437 * illegal block number, __getblk() will happily return a buffer_head
1438 * which represents the non-existent block. Very weird.
1440 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1441 * attempt is failing. FIXME, perhaps?
1443 struct buffer_head *
1444 __getblk(struct block_device *bdev, sector_t block, int size)
1446 struct buffer_head *bh = __find_get_block(bdev, block, size);
1448 if (bh == NULL)
1449 bh = __getblk_slow(bdev, block, size);
1450 return bh;
1452 EXPORT_SYMBOL(__getblk);
1455 * Do async read-ahead on a buffer..
1457 void __breadahead(struct block_device *bdev, sector_t block, int size)
1459 struct buffer_head *bh = __getblk(bdev, block, size);
1460 ll_rw_block(READA, 1, &bh);
1461 brelse(bh);
1463 EXPORT_SYMBOL(__breadahead);
1466 * __bread() - reads a specified block and returns the bh
1467 * @block: number of block
1468 * @size: size (in bytes) to read
1470 * Reads a specified block, and returns buffer head that contains it.
1471 * It returns NULL if the block was unreadable.
1473 struct buffer_head *
1474 __bread(struct block_device *bdev, sector_t block, int size)
1476 struct buffer_head *bh = __getblk(bdev, block, size);
1478 if (!buffer_uptodate(bh))
1479 bh = __bread_slow(bh);
1480 return bh;
1482 EXPORT_SYMBOL(__bread);
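/*
 * Example (illustrative sketch; foo_zero_block() is hypothetical): use
 * __bread() when the block's current contents are needed, and __getblk()
 * when the block will be completely overwritten - no read is issued then:
 */
static void foo_zero_block(struct block_device *bdev, sector_t block, int size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);
	set_buffer_uptodate(bh);	/* in-memory contents are now valid */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);
}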
1485 * invalidate_bh_lrus() is called rarely - at unmount. Because it is only for
1486 * unmount it only needs to ensure that all buffers from the target device are
1487 * invalidated on return and it doesn't need to worry about new buffers from
1488 * that device being added - the unmount code has to prevent that.
1490 static void invalidate_bh_lru(void *arg)
1492 struct bh_lru *b = &get_cpu_var(bh_lrus);
1493 int i;
1495 for (i = 0; i < BH_LRU_SIZE; i++) {
1496 brelse(b->bhs[i]);
1497 b->bhs[i] = NULL;
1499 put_cpu_var(bh_lrus);
1502 static void invalidate_bh_lrus(void)
1504 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1507 void set_bh_page(struct buffer_head *bh,
1508 struct page *page, unsigned long offset)
1510 bh->b_page = page;
1511 if (offset >= PAGE_SIZE)
1512 BUG();
1513 if (PageHighMem(page))
1515 * This catches illegal uses and preserves the offset:
1517 bh->b_data = (char *)(0 + offset);
1518 else
1519 bh->b_data = page_address(page) + offset;
1521 EXPORT_SYMBOL(set_bh_page);
1524 * Called when truncating a buffer on a page completely.
1526 static inline void discard_buffer(struct buffer_head * bh)
1528 lock_buffer(bh);
1529 clear_buffer_dirty(bh);
1530 bh->b_bdev = NULL;
1531 clear_buffer_mapped(bh);
1532 clear_buffer_req(bh);
1533 clear_buffer_new(bh);
1534 clear_buffer_delay(bh);
1535 unlock_buffer(bh);
1539 * try_to_release_page() - release old fs-specific metadata on a page
1541 * @page: the page which the kernel is trying to free
1542 * @gfp_mask: memory allocation flags (and I/O mode)
1544 * The address_space is to try to release any data against the page
1545 * (presumably at page->private). If the release was successful, return `1'.
1546 * Otherwise return zero.
1548 * The @gfp_mask argument specifies whether I/O may be performed to release
1549 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1551 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1553 int try_to_release_page(struct page *page, int gfp_mask)
1555 struct address_space * const mapping = page->mapping;
1557 if (!PageLocked(page))
1558 BUG();
1559 if (PageWriteback(page))
1560 return 0;
1562 if (mapping && mapping->a_ops->releasepage)
1563 return mapping->a_ops->releasepage(page, gfp_mask);
1564 return try_to_free_buffers(page);
1568 * block_invalidatepage - invalidate part or all of a buffer-backed page
1570 * @page: the page which is affected
1571 * @offset: the index of the truncation point
1573 * block_invalidatepage() is called when all or part of the page has become
1574 * invalidated by a truncate operation.
1576 * block_invalidatepage() does not have to release all buffers, but it must
1577 * ensure that no dirty buffer is left outside @offset and that no I/O
1578 * is underway against any of the blocks which are outside the truncation
1579 * point. Because the caller is about to free (and possibly reuse) those
1580 * blocks on-disk.
1582 int block_invalidatepage(struct page *page, unsigned long offset)
1584 struct buffer_head *head, *bh, *next;
1585 unsigned int curr_off = 0;
1586 int ret = 1;
1588 BUG_ON(!PageLocked(page));
1589 if (!page_has_buffers(page))
1590 goto out;
1592 head = page_buffers(page);
1593 bh = head;
1594 do {
1595 unsigned int next_off = curr_off + bh->b_size;
1596 next = bh->b_this_page;
1599 * is this block fully invalidated?
1601 if (offset <= curr_off)
1602 discard_buffer(bh);
1603 curr_off = next_off;
1604 bh = next;
1605 } while (bh != head);
1608 * We release buffers only if the entire page is being invalidated.
1609 * The get_block cached value has been unconditionally invalidated,
1610 * so real IO is not possible anymore.
1612 if (offset == 0)
1613 ret = try_to_release_page(page, 0);
1614 out:
1615 return ret;
1617 EXPORT_SYMBOL(block_invalidatepage);
1620 * We attach and possibly dirty the buffers atomically wrt
1621 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1622 * is already excluded via the page lock.
1624 void create_empty_buffers(struct page *page,
1625 unsigned long blocksize, unsigned long b_state)
1627 struct buffer_head *bh, *head, *tail;
1629 head = create_buffers(page, blocksize, 1);
1630 bh = head;
1631 do {
1632 bh->b_state |= b_state;
1633 tail = bh;
1634 bh = bh->b_this_page;
1635 } while (bh);
1636 tail->b_this_page = head;
1638 spin_lock(&page->mapping->private_lock);
1639 if (PageUptodate(page) || PageDirty(page)) {
1640 bh = head;
1641 do {
1642 if (PageDirty(page))
1643 set_buffer_dirty(bh);
1644 if (PageUptodate(page))
1645 set_buffer_uptodate(bh);
1646 bh = bh->b_this_page;
1647 } while (bh != head);
1649 __set_page_buffers(page, head);
1650 spin_unlock(&page->mapping->private_lock);
1652 EXPORT_SYMBOL(create_empty_buffers);
1655 * We are taking a block for data and we don't want any output from any
1656 * buffer-cache aliases starting from return from that function and
1657 * until the moment when something will explicitly mark the buffer
1658 * dirty (hopefully that will not happen until we will free that block ;-)
1659 * We don't even need to mark it not-uptodate - nobody can expect
1660 * anything from a newly allocated buffer anyway. We used to use
1661 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1662 * don't want to mark the alias unmapped, for example - it would confuse
1663 * anyone who might pick it with bread() afterwards...
1665 * Also.. Note that bforget() doesn't lock the buffer. So there can
1666 * be writeout I/O going on against recently-freed buffers. We don't
1667 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1668 * only if we really need to. That happens here.
1670 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1672 struct buffer_head *old_bh;
1674 old_bh = __find_get_block_slow(bdev, block, 0);
1675 if (old_bh) {
1676 #if 0 /* This happens. Later. */
1677 if (buffer_dirty(old_bh))
1678 buffer_error();
1679 #endif
1680 clear_buffer_dirty(old_bh);
1681 wait_on_buffer(old_bh);
1682 clear_buffer_req(old_bh);
1683 __brelse(old_bh);
1686 EXPORT_SYMBOL(unmap_underlying_metadata);
1689 * NOTE! All mapped/uptodate combinations are valid:
1691 *	Mapped	Uptodate	Meaning
1693 *	No	No		"unknown" - must do get_block()
1694 *	No	Yes		"hole" - zero-filled
1695 *	Yes	No		"allocated" - allocated on disk, not read in
1696 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1698 * "Dirty" is valid only with the last case (mapped+uptodate).
1702 * While block_write_full_page is writing back the dirty buffers under
1703 * the page lock, whoever dirtied the buffers may decide to clean them
1704 * again at any time. We handle that by only looking at the buffer
1705 * state inside lock_buffer().
1707 * If block_write_full_page() is called for regular writeback
1708 * (called_for_sync() is false) then it will redirty a page which has a locked
1709 * buffer. This only can happen if someone has written the buffer directly,
1710 * with submit_bh(). At the address_space level PageWriteback prevents this
1711 * contention from occurring.
1713 static int __block_write_full_page(struct inode *inode, struct page *page,
1714 get_block_t *get_block, struct writeback_control *wbc)
1716 int err;
1717 unsigned long block;
1718 unsigned long last_block;
1719 struct buffer_head *bh, *head;
1720 int nr_underway = 0;
1722 BUG_ON(!PageLocked(page));
1724 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1726 if (!page_has_buffers(page)) {
1727 if (!PageUptodate(page))
1728 buffer_error();
1729 create_empty_buffers(page, 1 << inode->i_blkbits,
1730 (1 << BH_Dirty)|(1 << BH_Uptodate));
1734 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1735 * here, and the (potentially unmapped) buffers may become dirty at
1736 * any time. If a buffer becomes dirty here after we've inspected it
1737 * then we just miss that fact, and the page stays dirty.
1739 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1740 * handle that here by just cleaning them.
1743 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1744 head = page_buffers(page);
1745 bh = head;
1748 * Get all the dirty buffers mapped to disk addresses and
1749 * handle any aliases from the underlying blockdev's mapping.
1751 do {
1752 if (block > last_block) {
1754 * mapped buffers outside i_size will occur, because
1755 * this page can be outside i_size when there is a
1756 * truncate in progress.
1758 * if (buffer_mapped(bh))
1759 * buffer_error();
1762 * The buffer was zeroed by block_write_full_page()
1764 clear_buffer_dirty(bh);
1765 set_buffer_uptodate(bh);
1766 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1767 if (buffer_new(bh))
1768 buffer_error();
1769 err = get_block(inode, block, bh, 1);
1770 if (err)
1771 goto recover;
1772 if (buffer_new(bh)) {
1773 /* blockdev mappings never come here */
1774 clear_buffer_new(bh);
1775 unmap_underlying_metadata(bh->b_bdev,
1776 bh->b_blocknr);
1779 bh = bh->b_this_page;
1780 block++;
1781 } while (bh != head);
1783 do {
1784 get_bh(bh);
1785 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1786 if (wbc->sync_mode != WB_SYNC_NONE) {
1787 lock_buffer(bh);
1788 } else {
1789 if (test_set_buffer_locked(bh)) {
1790 __set_page_dirty_nobuffers(page);
1791 continue;
1794 if (test_clear_buffer_dirty(bh)) {
1795 if (!buffer_uptodate(bh))
1796 buffer_error();
1797 mark_buffer_async_write(bh);
1798 } else {
1799 unlock_buffer(bh);
1802 } while ((bh = bh->b_this_page) != head);
1804 BUG_ON(PageWriteback(page));
1805 SetPageWriteback(page); /* Keeps try_to_free_buffers() away */
1806 unlock_page(page);
1809 * The page may come unlocked any time after the *first* submit_bh()
1810 * call. Be careful with its buffers.
1812 do {
1813 struct buffer_head *next = bh->b_this_page;
1814 if (buffer_async_write(bh)) {
1815 submit_bh(WRITE, bh);
1816 nr_underway++;
1818 put_bh(bh);
1819 bh = next;
1820 } while (bh != head);
1822 err = 0;
1823 done:
1824 if (nr_underway == 0) {
1826 * The page was marked dirty, but the buffers were
1827 * clean. Someone wrote them back by hand with
1828 * ll_rw_block/submit_bh. A rare case.
1830 int uptodate = 1;
1831 do {
1832 if (!buffer_uptodate(bh)) {
1833 uptodate = 0;
1834 break;
1836 bh = bh->b_this_page;
1837 } while (bh != head);
1838 if (uptodate)
1839 SetPageUptodate(page);
1840 end_page_writeback(page);
1842 return err;
1844 recover:
1846 * ENOSPC, or some other error. We may already have added some
1847 * blocks to the file, so we need to write these out to avoid
1848 * exposing stale data.
1849 * The page is currently locked and not marked for writeback
1851 bh = head;
1852 /* Recovery: lock and submit the mapped buffers */
1853 do {
1854 get_bh(bh);
1855 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1856 lock_buffer(bh);
1857 mark_buffer_async_write(bh);
1858 } else {
1860 * The buffer may have been set dirty during
1861 * attachment to a dirty page.
1863 clear_buffer_dirty(bh);
1865 } while ((bh = bh->b_this_page) != head);
1866 SetPageError(page);
1867 BUG_ON(PageWriteback(page));
1868 SetPageWriteback(page);
1869 unlock_page(page);
1870 do {
1871 struct buffer_head *next = bh->b_this_page;
1872 if (buffer_async_write(bh)) {
1873 clear_buffer_dirty(bh);
1874 submit_bh(WRITE, bh);
1875 nr_underway++;
1877 put_bh(bh);
1878 bh = next;
1879 } while (bh != head);
1880 goto done;
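/*
 * Example (illustrative sketch): filesystems do not call the helper above
 * directly; they call the public block_write_full_page() wrapper (defined
 * further down in this file) from their ->writepage method, passing their
 * own get_block.  foofs_writepage() and foofs_get_block() are hypothetical:
 */
static int foofs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, foofs_get_block, wbc);
}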
1883 static int __block_prepare_write(struct inode *inode, struct page *page,
1884 unsigned from, unsigned to, get_block_t *get_block)
1886 unsigned block_start, block_end;
1887 sector_t block;
1888 int err = 0;
1889 unsigned blocksize, bbits;
1890 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1892 BUG_ON(!PageLocked(page));
1893 BUG_ON(from > PAGE_CACHE_SIZE);
1894 BUG_ON(to > PAGE_CACHE_SIZE);
1895 BUG_ON(from > to);
1897 blocksize = 1 << inode->i_blkbits;
1898 if (!page_has_buffers(page))
1899 create_empty_buffers(page, blocksize, 0);
1900 head = page_buffers(page);
1902 bbits = inode->i_blkbits;
1903 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1905 for(bh = head, block_start = 0; bh != head || !block_start;
1906 block++, block_start=block_end, bh = bh->b_this_page) {
1907 block_end = block_start + blocksize;
1908 if (block_end <= from || block_start >= to) {
1909 if (PageUptodate(page)) {
1910 if (!buffer_uptodate(bh))
1911 set_buffer_uptodate(bh);
1913 continue;
1915 if (buffer_new(bh))
1916 clear_buffer_new(bh);
1917 if (!buffer_mapped(bh)) {
1918 err = get_block(inode, block, bh, 1);
1919 if (err)
1920 goto out;
1921 if (buffer_new(bh)) {
1922 clear_buffer_new(bh);
1923 unmap_underlying_metadata(bh->b_bdev,
1924 bh->b_blocknr);
1925 if (PageUptodate(page)) {
1926 if (!buffer_mapped(bh))
1927 buffer_error();
1928 set_buffer_uptodate(bh);
1929 continue;
1931 if (block_end > to || block_start < from) {
1932 void *kaddr;
1934 kaddr = kmap_atomic(page, KM_USER0);
1935 if (block_end > to)
1936 memset(kaddr+to, 0,
1937 block_end-to);
1938 if (block_start < from)
1939 memset(kaddr+block_start,
1940 0, from-block_start);
1941 flush_dcache_page(page);
1942 kunmap_atomic(kaddr, KM_USER0);
1944 continue;
1947 if (PageUptodate(page)) {
1948 if (!buffer_uptodate(bh))
1949 set_buffer_uptodate(bh);
1950 continue;
1952 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1953 (block_start < from || block_end > to)) {
1954 ll_rw_block(READ, 1, &bh);
1955 *wait_bh++=bh;
1959 * If we issued read requests - let them complete.
1961 while(wait_bh > wait) {
1962 wait_on_buffer(*--wait_bh);
1963 if (!buffer_uptodate(*wait_bh))
1964 return -EIO;
1966 return 0;
1967 out:
1969 * Zero out any newly allocated blocks to avoid exposing stale
1970 * data. If BH_New is set, we know that the block was newly
1971 * allocated in the above loop.
1973 bh = head;
1974 block_start = 0;
1975 do {
1976 block_end = block_start+blocksize;
1977 if (block_end <= from)
1978 goto next_bh;
1979 if (block_start >= to)
1980 break;
1981 if (buffer_new(bh)) {
1982 void *kaddr;
1984 clear_buffer_new(bh);
1985 if (buffer_uptodate(bh))
1986 buffer_error();
1987 kaddr = kmap_atomic(page, KM_USER0);
1988 memset(kaddr+block_start, 0, bh->b_size);
1989 kunmap_atomic(kaddr, KM_USER0);
1990 set_buffer_uptodate(bh);
1991 mark_buffer_dirty(bh);
1993 next_bh:
1994 block_start = block_end;
1995 bh = bh->b_this_page;
1996 } while (bh != head);
1997 return err;
2000 static int __block_commit_write(struct inode *inode, struct page *page,
2001 unsigned from, unsigned to)
2003 unsigned block_start, block_end;
2004 int partial = 0;
2005 unsigned blocksize;
2006 struct buffer_head *bh, *head;
2008 blocksize = 1 << inode->i_blkbits;
2010 for(bh = head = page_buffers(page), block_start = 0;
2011 bh != head || !block_start;
2012 block_start=block_end, bh = bh->b_this_page) {
2013 block_end = block_start + blocksize;
2014 if (block_end <= from || block_start >= to) {
2015 if (!buffer_uptodate(bh))
2016 partial = 1;
2017 } else {
2018 set_buffer_uptodate(bh);
2019 mark_buffer_dirty(bh);
2024 * If this is a partial write which happened to make all buffers
2025 * uptodate then we can optimize away a bogus readpage() for
2026 * the next read(). Here we 'discover' whether the page went
2027 * uptodate as a result of this (potentially partial) write.
2029 if (!partial)
2030 SetPageUptodate(page);
2031 return 0;
2035 * Generic "read page" function for block devices that have the normal
2036 * get_block functionality. This is most of the block device filesystems.
2037 * Reads the page asynchronously --- the unlock_buffer() and
2038 * set/clear_buffer_uptodate() functions propagate buffer state into the
2039 * page struct once IO has completed.
2041 int block_read_full_page(struct page *page, get_block_t *get_block)
2043 struct inode *inode = page->mapping->host;
2044 sector_t iblock, lblock;
2045 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2046 unsigned int blocksize;
2047 int nr, i;
2048 int fully_mapped = 1;
2050 if (!PageLocked(page))
2051 PAGE_BUG(page);
2052 if (PageUptodate(page))
2053 buffer_error();
2054 blocksize = 1 << inode->i_blkbits;
2055 if (!page_has_buffers(page))
2056 create_empty_buffers(page, blocksize, 0);
2057 head = page_buffers(page);
2059 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2060 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2061 bh = head;
2062 nr = 0;
2063 i = 0;
2065 do {
2066 if (buffer_uptodate(bh))
2067 continue;
2069 if (!buffer_mapped(bh)) {
2070 fully_mapped = 0;
2071 if (iblock < lblock) {
2072 if (get_block(inode, iblock, bh, 0))
2073 SetPageError(page);
2075 if (!buffer_mapped(bh)) {
2076 void *kaddr = kmap_atomic(page, KM_USER0);
2077 memset(kaddr + i * blocksize, 0, blocksize);
2078 flush_dcache_page(page);
2079 kunmap_atomic(kaddr, KM_USER0);
2080 set_buffer_uptodate(bh);
2081 continue;
2084 * get_block() might have updated the buffer
2085 * synchronously
2087 if (buffer_uptodate(bh))
2088 continue;
2090 arr[nr++] = bh;
2091 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2093 if (fully_mapped)
2094 SetPageMappedToDisk(page);
2096 if (!nr) {
2098 * All buffers are uptodate - we can set the page uptodate
2099 * as well. But not if get_block() returned an error.
2101 if (!PageError(page))
2102 SetPageUptodate(page);
2103 unlock_page(page);
2104 return 0;
2107 /* Stage two: lock the buffers */
2108 for (i = 0; i < nr; i++) {
2109 bh = arr[i];
2110 lock_buffer(bh);
2111 mark_buffer_async_read(bh);
2115 * Stage 3: start the IO. Check for uptodateness
2116 * inside the buffer lock in case another process reading
2117 * the underlying blockdev brought it uptodate (the sct fix).
2119 for (i = 0; i < nr; i++) {
2120 bh = arr[i];
2121 if (buffer_uptodate(bh))
2122 end_buffer_async_read(bh, 1);
2123 else
2124 submit_bh(READ, bh);
2126 return 0;
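/*
 * Editor's illustration (not part of the original file): a typical
 * buffer-backed filesystem wires block_read_full_page() into its
 * address_space operations through a thin wrapper.  "myfs_get_block"
 * is a hypothetical get_block_t callback supplied by the filesystem.
 */
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}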
2129 /* utility function for filesystems that need to do work on expanding
2130 * truncates. Uses prepare/commit_write to allow the filesystem to
2131 * deal with the hole.
2133 int generic_cont_expand(struct inode *inode, loff_t size)
2135 struct address_space *mapping = inode->i_mapping;
2136 struct page *page;
2137 unsigned long index, offset, limit;
2138 int err;
2140 err = -EFBIG;
2141 limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2142 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2143 send_sig(SIGXFSZ, current, 0);
2144 goto out;
2146 if (size > inode->i_sb->s_maxbytes)
2147 goto out;
2149 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2151 /* ugh. in prepare/commit_write, if from==to==start of block, we
2152 ** skip the prepare. make sure we never send an offset for the start
2153 ** of a block
2155 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2156 offset++;
2158 index = size >> PAGE_CACHE_SHIFT;
2159 err = -ENOMEM;
2160 page = grab_cache_page(mapping, index);
2161 if (!page)
2162 goto out;
2163 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2164 if (!err) {
2165 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2167 unlock_page(page);
2168 page_cache_release(page);
2169 if (err > 0)
2170 err = 0;
2171 out:
2172 return err;
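/*
 * Editor's illustration (not part of the original file): a filesystem that
 * cannot represent holes may call generic_cont_expand() from its ->setattr()
 * method when a truncate grows the file.  The surrounding setattr logic is a
 * sketch, not taken from any particular filesystem.
 */
static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int err = 0;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
		err = generic_cont_expand(inode, attr->ia_size);
	if (!err)
		err = inode_setattr(inode, attr);
	return err;
}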
2176 * For moronic filesystems that do not allow holes in a file.
2177 * We may have to extend the file.
2180 int cont_prepare_write(struct page *page, unsigned offset,
2181 unsigned to, get_block_t *get_block, loff_t *bytes)
2183 struct address_space *mapping = page->mapping;
2184 struct inode *inode = mapping->host;
2185 struct page *new_page;
2186 unsigned long pgpos;
2187 long status;
2188 unsigned zerofrom;
2189 unsigned blocksize = 1 << inode->i_blkbits;
2190 void *kaddr;
2192 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2193 status = -ENOMEM;
2194 new_page = grab_cache_page(mapping, pgpos);
2195 if (!new_page)
2196 goto out;
2197 /* we might sleep */
2198 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2199 unlock_page(new_page);
2200 page_cache_release(new_page);
2201 continue;
2203 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2204 if (zerofrom & (blocksize-1)) {
2205 *bytes |= (blocksize-1);
2206 (*bytes)++;
2208 status = __block_prepare_write(inode, new_page, zerofrom,
2209 PAGE_CACHE_SIZE, get_block);
2210 if (status)
2211 goto out_unmap;
2212 kaddr = kmap_atomic(new_page, KM_USER0);
2213 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2214 flush_dcache_page(new_page);
2215 kunmap_atomic(kaddr, KM_USER0);
2216 __block_commit_write(inode, new_page,
2217 zerofrom, PAGE_CACHE_SIZE);
2218 unlock_page(new_page);
2219 page_cache_release(new_page);
2222 if (page->index < pgpos) {
2223 /* completely inside the area */
2224 zerofrom = offset;
2225 } else {
2226 /* page covers the boundary, find the boundary offset */
2227 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2229 /* if we are going to expand the file, the last block will be filled */
2230 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2231 *bytes |= (blocksize-1);
2232 (*bytes)++;
2235 /* starting below the boundary? Nothing to zero out */
2236 if (offset <= zerofrom)
2237 zerofrom = offset;
2239 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2240 if (status)
2241 goto out1;
2242 if (zerofrom < offset) {
2243 kaddr = kmap_atomic(page, KM_USER0);
2244 memset(kaddr+zerofrom, 0, offset-zerofrom);
2245 flush_dcache_page(page);
2246 kunmap_atomic(kaddr, KM_USER0);
2247 __block_commit_write(inode, page, zerofrom, offset);
2249 return 0;
2250 out1:
2251 ClearPageUptodate(page);
2252 return status;
2254 out_unmap:
2255 ClearPageUptodate(new_page);
2256 unlock_page(new_page);
2257 page_cache_release(new_page);
2258 out:
2259 return status;
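/*
 * Editor's illustration (not part of the original file): cont_prepare_write()
 * is for filesystems (FAT-like) that cannot leave holes.  The filesystem
 * passes a pointer to its own record of how many bytes have been allocated
 * so far, so the gap up to the write position gets zeroed and written.
 * "contfs_get_block" and the CONTFS_I(inode)->mmu_private field are
 * hypothetical.
 */
static int contfs_prepare_write(struct file *file, struct page *page,
			unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;

	/* &CONTFS_I(inode)->mmu_private: per-inode count of bytes
	 * allocated so far (hypothetical bookkeeping field). */
	return cont_prepare_write(page, from, to, contfs_get_block,
				&CONTFS_I(inode)->mmu_private);
}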
2262 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2263 get_block_t *get_block)
2265 struct inode *inode = page->mapping->host;
2266 int err = __block_prepare_write(inode, page, from, to, get_block);
2267 if (err)
2268 ClearPageUptodate(page);
2269 return err;
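/*
 * Editor's illustration (not part of the original file): most filesystems
 * simply forward ->prepare_write() to block_prepare_write() with their own
 * get_block callback; "myfs_get_block" is hypothetical.
 */
static int myfs_prepare_write(struct file *file, struct page *page,
			unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, myfs_get_block);
}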
2272 int block_commit_write(struct page *page, unsigned from, unsigned to)
2274 struct inode *inode = page->mapping->host;
2275 __block_commit_write(inode,page,from,to);
2276 return 0;
2279 int generic_commit_write(struct file *file, struct page *page,
2280 unsigned from, unsigned to)
2282 struct inode *inode = page->mapping->host;
2283 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2284 __block_commit_write(inode,page,from,to);
2286 * No need to use i_size_read() here, the i_size
2287 * cannot change under us because we hold i_sem.
2289 if (pos > inode->i_size) {
2290 i_size_write(inode, pos);
2291 mark_inode_dirty(inode);
2293 return 0;
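/*
 * Editor's illustration (not part of the original file): pulling the pieces
 * together, a conventional buffer-backed filesystem typically publishes an
 * address_space_operations table along these lines.  The myfs_* methods are
 * the hypothetical one-line forwards sketched next to block_read_full_page(),
 * block_prepare_write(), block_write_full_page() and generic_block_bmap()
 * elsewhere in this file; generic_commit_write() and block_sync_page() are
 * used directly.
 */
static struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,
	.writepage	= myfs_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= myfs_prepare_write,
	.commit_write	= generic_commit_write,
	.bmap		= myfs_bmap,
};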
2297 * On entry, the page is not uptodate at all.
2298 * On exit, the page is fully uptodate in the areas outside (from,to)
2300 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2301 get_block_t *get_block)
2303 struct inode *inode = page->mapping->host;
2304 const unsigned blkbits = inode->i_blkbits;
2305 const unsigned blocksize = 1 << blkbits;
2306 struct buffer_head map_bh;
2307 struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2308 unsigned block_in_page;
2309 unsigned block_start;
2310 sector_t block_in_file;
2311 char *kaddr;
2312 int nr_reads = 0;
2313 int i;
2314 int ret = 0;
2315 int is_mapped_to_disk = 1;
2316 int dirtied_it = 0;
2318 if (PageMappedToDisk(page))
2319 return 0;
2321 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2322 map_bh.b_page = page;
2325 * We loop across all blocks in the page, whether or not they are
2326 * part of the affected region. This is so we can discover if the
2327 * page is fully mapped-to-disk.
2329 for (block_start = 0, block_in_page = 0;
2330 block_start < PAGE_CACHE_SIZE;
2331 block_in_page++, block_start += blocksize) {
2332 unsigned block_end = block_start + blocksize;
2333 int create;
2335 map_bh.b_state = 0;
2336 create = 1;
2337 if (block_start >= to)
2338 create = 0;
2339 ret = get_block(inode, block_in_file + block_in_page,
2340 &map_bh, create);
2341 if (ret)
2342 goto failed;
2343 if (!buffer_mapped(&map_bh))
2344 is_mapped_to_disk = 0;
2345 if (buffer_new(&map_bh))
2346 unmap_underlying_metadata(map_bh.b_bdev,
2347 map_bh.b_blocknr);
2348 if (PageUptodate(page))
2349 continue;
2350 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2351 kaddr = kmap_atomic(page, KM_USER0);
2352 if (block_start < from) {
2353 memset(kaddr+block_start, 0, from-block_start);
2354 dirtied_it = 1;
2356 if (block_end > to) {
2357 memset(kaddr + to, 0, block_end - to);
2358 dirtied_it = 1;
2360 flush_dcache_page(page);
2361 kunmap_atomic(kaddr, KM_USER0);
2362 continue;
2364 if (buffer_uptodate(&map_bh))
2365 continue; /* reiserfs does this */
2366 if (block_start < from || block_end > to) {
2367 struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2369 if (!bh) {
2370 ret = -ENOMEM;
2371 goto failed;
2373 bh->b_state = map_bh.b_state;
2374 atomic_set(&bh->b_count, 0);
2375 bh->b_this_page = NULL;
2376 bh->b_page = page;
2377 bh->b_blocknr = map_bh.b_blocknr;
2378 bh->b_size = blocksize;
2379 bh->b_data = (char *)(long)block_start;
2380 bh->b_bdev = map_bh.b_bdev;
2381 bh->b_private = NULL;
2382 read_bh[nr_reads++] = bh;
2386 if (nr_reads) {
2387 ll_rw_block(READ, nr_reads, read_bh);
2388 for (i = 0; i < nr_reads; i++) {
2389 wait_on_buffer(read_bh[i]);
2390 if (!buffer_uptodate(read_bh[i]))
2391 ret = -EIO;
2392 free_buffer_head(read_bh[i]);
2393 read_bh[i] = NULL;
2395 if (ret)
2396 goto failed;
2399 if (is_mapped_to_disk)
2400 SetPageMappedToDisk(page);
2401 SetPageUptodate(page);
2404 * Setting the page dirty here isn't necessary for the prepare_write
2405 * function - commit_write will do that. But if/when this function is
2406 * used within the pagefault handler to ensure that all mmapped pages
2407 * have backing space in the filesystem, we will need to dirty the page
2408 * if its contents were altered.
2410 if (dirtied_it)
2411 set_page_dirty(page);
2413 return 0;
2415 failed:
2416 for (i = 0; i < nr_reads; i++) {
2417 if (read_bh[i])
2418 free_buffer_head(read_bh[i]);
2422 * Error recovery is pretty slack. Clear the page and mark it dirty
2423 * so we'll later zero out any blocks which _were_ allocated.
2425 kaddr = kmap_atomic(page, KM_USER0);
2426 memset(kaddr, 0, PAGE_CACHE_SIZE);
2427 kunmap_atomic(kaddr, KM_USER0);
2428 SetPageUptodate(page);
2429 set_page_dirty(page);
2430 return ret;
2432 EXPORT_SYMBOL(nobh_prepare_write);
2434 int nobh_commit_write(struct file *file, struct page *page,
2435 unsigned from, unsigned to)
2437 struct inode *inode = page->mapping->host;
2438 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2440 set_page_dirty(page);
2441 if (pos > inode->i_size) {
2442 i_size_write(inode, pos);
2443 mark_inode_dirty(inode);
2445 return 0;
2447 EXPORT_SYMBOL(nobh_commit_write);
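/*
 * Editor's illustration (not part of the original file): a filesystem that
 * wants to avoid keeping buffer_heads attached to its data pages can use the
 * nobh_* variants for the prepare/commit pair instead.  "myfs_get_block" is
 * hypothetical.
 */
static int myfs_nobh_prepare_write(struct file *file, struct page *page,
			unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, myfs_get_block);
}

/* ...with .prepare_write pointing at the wrapper above and .commit_write
 * pointing at nobh_commit_write in the address_space_operations table. */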
2450 * This function assumes that ->prepare_write() uses nobh_prepare_write().
2452 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2454 struct inode *inode = mapping->host;
2455 unsigned blocksize = 1 << inode->i_blkbits;
2456 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2457 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2458 unsigned to;
2459 struct page *page;
2460 struct address_space_operations *a_ops = mapping->a_ops;
2461 char *kaddr;
2462 int ret = 0;
2464 if ((offset & (blocksize - 1)) == 0)
2465 goto out;
2467 ret = -ENOMEM;
2468 page = grab_cache_page(mapping, index);
2469 if (!page)
2470 goto out;
2472 to = (offset + blocksize) & ~(blocksize - 1);
2473 ret = a_ops->prepare_write(NULL, page, offset, to);
2474 if (ret == 0) {
2475 kaddr = kmap_atomic(page, KM_USER0);
2476 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2477 flush_dcache_page(page);
2478 kunmap_atomic(kaddr, KM_USER0);
2479 set_page_dirty(page);
2481 unlock_page(page);
2482 page_cache_release(page);
2483 out:
2484 return ret;
2486 EXPORT_SYMBOL(nobh_truncate_page);
2488 int block_truncate_page(struct address_space *mapping,
2489 loff_t from, get_block_t *get_block)
2491 unsigned long index = from >> PAGE_CACHE_SHIFT;
2492 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2493 unsigned blocksize, iblock, length, pos;
2494 struct inode *inode = mapping->host;
2495 struct page *page;
2496 struct buffer_head *bh;
2497 void *kaddr;
2498 int err;
2500 blocksize = 1 << inode->i_blkbits;
2501 length = offset & (blocksize - 1);
2503 /* Block boundary? Nothing to do */
2504 if (!length)
2505 return 0;
2507 length = blocksize - length;
2508 iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2510 page = grab_cache_page(mapping, index);
2511 err = -ENOMEM;
2512 if (!page)
2513 goto out;
2515 if (!page_has_buffers(page))
2516 create_empty_buffers(page, blocksize, 0);
2518 /* Find the buffer that contains "offset" */
2519 bh = page_buffers(page);
2520 pos = blocksize;
2521 while (offset >= pos) {
2522 bh = bh->b_this_page;
2523 iblock++;
2524 pos += blocksize;
2527 err = 0;
2528 if (!buffer_mapped(bh)) {
2529 err = get_block(inode, iblock, bh, 0);
2530 if (err)
2531 goto unlock;
2532 /* unmapped? It's a hole - nothing to do */
2533 if (!buffer_mapped(bh))
2534 goto unlock;
2537 /* Ok, it's mapped. Make sure it's up-to-date */
2538 if (PageUptodate(page))
2539 set_buffer_uptodate(bh);
2541 if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2542 err = -EIO;
2543 ll_rw_block(READ, 1, &bh);
2544 wait_on_buffer(bh);
2545 /* Uhhuh. Read error. Complain and punt. */
2546 if (!buffer_uptodate(bh))
2547 goto unlock;
2550 kaddr = kmap_atomic(page, KM_USER0);
2551 memset(kaddr + offset, 0, length);
2552 flush_dcache_page(page);
2553 kunmap_atomic(kaddr, KM_USER0);
2555 mark_buffer_dirty(bh);
2556 err = 0;
2558 unlock:
2559 unlock_page(page);
2560 page_cache_release(page);
2561 out:
2562 return err;
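/*
 * Editor's illustration (not part of the original file): block_truncate_page()
 * is normally called from the filesystem's truncate path to zero the tail of
 * the final, partially used block (which may be mmapped) before the block
 * mappings themselves are trimmed.  "myfs_get_block" is hypothetical.
 */
static void myfs_truncate(struct inode *inode)
{
	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);
	/* ...followed by the filesystem-specific work of freeing the
	 * now-unused blocks beyond the new i_size. */
}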
2566 * The generic ->writepage function for buffer-backed address_spaces
2568 int block_write_full_page(struct page *page, get_block_t *get_block,
2569 struct writeback_control *wbc)
2571 struct inode * const inode = page->mapping->host;
2572 loff_t i_size = i_size_read(inode);
2573 const unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2574 unsigned offset;
2575 void *kaddr;
2577 /* Is the page fully inside i_size? */
2578 if (page->index < end_index)
2579 return __block_write_full_page(inode, page, get_block, wbc);
2581 /* Is the page fully outside i_size? (truncate in progress) */
2582 offset = i_size & (PAGE_CACHE_SIZE-1);
2583 if (page->index >= end_index+1 || !offset) {
2585 * The page may have dirty, unmapped buffers. For example,
2586 * they may have been added in ext3_writepage(). Make them
2587 * freeable here, so the page does not leak.
2589 block_invalidatepage(page, 0);
2590 unlock_page(page);
2591 return -EIO;
2595 * The page straddles i_size. It must be zeroed out on each and every
2596 * writepage invocation because it may be mmapped. "A file is mapped
2597 * in multiples of the page size. For a file that is not a multiple of
2598 * the page size, the remaining memory is zeroed when mapped, and
2599 * writes to that region are not written out to the file."
2601 kaddr = kmap_atomic(page, KM_USER0);
2602 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2603 flush_dcache_page(page);
2604 kunmap_atomic(kaddr, KM_USER0);
2605 return __block_write_full_page(inode, page, get_block, wbc);
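/*
 * Editor's illustration (not part of the original file): ->writepage() for a
 * simple buffer-backed filesystem is usually just a forward to
 * block_write_full_page(); "myfs_get_block" is hypothetical.
 */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}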
2608 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2609 get_block_t *get_block)
2611 struct buffer_head tmp;
2612 struct inode *inode = mapping->host;
2613 tmp.b_state = 0;
2614 tmp.b_blocknr = 0;
2615 get_block(inode, block, &tmp, 0);
2616 return tmp.b_blocknr;
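/*
 * Editor's illustration (not part of the original file): the ->bmap() method
 * (used by the FIBMAP ioctl and by swapfile setup) can usually be implemented
 * directly on top of generic_block_bmap(); "myfs_get_block" is hypothetical.
 */
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}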
2619 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2621 struct buffer_head *bh = bio->bi_private;
2623 if (bio->bi_size)
2624 return 1;
2626 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2627 bio_put(bio);
2628 return 0;
2631 int submit_bh(int rw, struct buffer_head * bh)
2633 struct bio *bio;
2635 BUG_ON(!buffer_locked(bh));
2636 BUG_ON(!buffer_mapped(bh));
2637 BUG_ON(!bh->b_end_io);
2639 if ((rw == READ || rw == READA) && buffer_uptodate(bh))
2640 buffer_error();
2641 if (rw == WRITE && !buffer_uptodate(bh))
2642 buffer_error();
2643 if (rw == READ && buffer_dirty(bh))
2644 buffer_error();
2646 set_buffer_req(bh);
2649 * from here on down, it's all bio -- do the initial mapping,
2650 * submit_bio -> generic_make_request may further map this bio around
2652 bio = bio_alloc(GFP_NOIO, 1);
2654 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2655 bio->bi_bdev = bh->b_bdev;
2656 bio->bi_io_vec[0].bv_page = bh->b_page;
2657 bio->bi_io_vec[0].bv_len = bh->b_size;
2658 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2660 bio->bi_vcnt = 1;
2661 bio->bi_idx = 0;
2662 bio->bi_size = bh->b_size;
2664 bio->bi_end_io = end_bio_bh_io_sync;
2665 bio->bi_private = bh;
2667 return submit_bio(rw, bio);
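/*
 * Editor's illustration (not part of the original file): submit_bh() expects
 * a locked, mapped buffer with b_end_io set.  A minimal synchronous read of a
 * single buffer, assumed to be already mapped and clean, might look like this
 * sketch (it mirrors the pattern ll_rw_block() uses below).
 */
static int example_read_bh(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		/* someone else brought it uptodate while we waited */
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);				/* dropped by the completion handler */
	bh->b_end_io = end_buffer_io_sync;	/* unlocks bh and wakes waiters */
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}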
2671 * ll_rw_block: low-level access to block devices (DEPRECATED)
2672 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2673 * @nr: number of &struct buffer_heads in the array
2674 * @bhs: array of pointers to &struct buffer_head
2676 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2677 * and requests an I/O operation on them, either a %READ or a %WRITE.
2678 * The third %READA option is described in the documentation for
2679 * generic_make_request() which ll_rw_block() calls.
2681 * This function drops any buffer that it cannot get a lock on (with the
2682 * BH_Lock state bit), any buffer that appears to be clean when doing a
2683 * write request, and any buffer that appears to be up-to-date when doing
2684 * a read request. Further, it marks as clean those buffers that are processed for
2685 * writing (the buffer cache won't assume that they are actually clean until
2686 * the buffer gets unlocked).
2688 * ll_rw_block sets b_end_io to a simple completion handler that marks
2689 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2690 * any waiters.
2692 * All of the buffers must be for the same device, and their size must be
2693 * a multiple of the current approved block size for the device.
2695 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2697 int i;
2699 for (i = 0; i < nr; i++) {
2700 struct buffer_head *bh = bhs[i];
2702 if (test_set_buffer_locked(bh))
2703 continue;
2705 get_bh(bh);
2706 bh->b_end_io = end_buffer_io_sync;
2707 if (rw == WRITE) {
2708 if (test_clear_buffer_dirty(bh)) {
2709 submit_bh(WRITE, bh);
2710 continue;
2712 } else {
2713 if (!buffer_uptodate(bh)) {
2714 submit_bh(rw, bh);
2715 continue;
2718 unlock_buffer(bh);
2719 put_bh(bh);
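/*
 * Editor's illustration (not part of the original file): a common pattern is
 * to use ll_rw_block() to start reads on a batch of metadata buffers (obtained
 * earlier, e.g. with sb_getblk()) and then wait for each one:
 */
static int example_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i, err = 0;

	ll_rw_block(READ, nr, bhs);
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			err = -EIO;
	}
	return err;
}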
2724 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2725 * and then start new I/O and then wait upon it.
2727 void sync_dirty_buffer(struct buffer_head *bh)
2729 WARN_ON(atomic_read(&bh->b_count) < 1);
2730 lock_buffer(bh);
2731 if (test_clear_buffer_dirty(bh)) {
2732 get_bh(bh);
2733 bh->b_end_io = end_buffer_io_sync;
2734 submit_bh(WRITE, bh);
2735 wait_on_buffer(bh);
2736 } else {
2737 unlock_buffer(bh);
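/*
 * Editor's illustration (not part of the original file): sync_dirty_buffer()
 * is the usual way for a filesystem to force a single piece of metadata to
 * disk, for example after updating an on-disk superblock copy held in "bh":
 */
static void example_sync_metadata(struct buffer_head *bh)
{
	mark_buffer_dirty(bh);
	sync_dirty_buffer(bh);		/* waits for the write to complete */
	if (!buffer_uptodate(bh))
		printk(KERN_ERR "example: metadata write failed\n");
}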
2742 * Sanity checks for try_to_free_buffers.
2744 static void check_ttfb_buffer(struct page *page, struct buffer_head *bh)
2746 if (!buffer_uptodate(bh) && !buffer_req(bh)) {
2747 if (PageUptodate(page) && page->mapping
2748 && buffer_mapped(bh) /* discard_buffer */
2749 && S_ISBLK(page->mapping->host->i_mode))
2751 buffer_error();
2757 * try_to_free_buffers() checks if all the buffers on this particular page
2758 * are unused, and releases them if so.
2760 * Exclusion against try_to_free_buffers may be obtained by either
2761 * locking the page or by holding its mapping's private_lock.
2763 * If the page is dirty but all the buffers are clean then we need to
2764 * be sure to mark the page clean as well. This is because the page
2765 * may be against a block device, and a later reattachment of buffers
2766 * to a dirty page will set *all* buffers dirty. Which would corrupt
2767 * filesystem data on the same device.
2769 * The same applies to regular filesystem pages: if all the buffers are
2770 * clean then we set the page clean and proceed. To do that, we require
2771 * total exclusion from __set_page_dirty_buffers(). That is obtained with
2772 * private_lock.
2774 * try_to_free_buffers() is non-blocking.
2776 static inline int buffer_busy(struct buffer_head *bh)
2778 return atomic_read(&bh->b_count) |
2779 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2782 static int
2783 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2785 struct buffer_head *head = page_buffers(page);
2786 struct buffer_head *bh;
2787 int was_uptodate = 1;
2789 bh = head;
2790 do {
2791 check_ttfb_buffer(page, bh);
2792 if (buffer_busy(bh))
2793 goto failed;
2794 if (!buffer_uptodate(bh) && !buffer_req(bh))
2795 was_uptodate = 0;
2796 bh = bh->b_this_page;
2797 } while (bh != head);
2799 if (!was_uptodate && PageUptodate(page))
2800 buffer_error();
2802 do {
2803 struct buffer_head *next = bh->b_this_page;
2805 if (!list_empty(&bh->b_assoc_buffers))
2806 __remove_assoc_queue(bh);
2807 bh = next;
2808 } while (bh != head);
2809 *buffers_to_free = head;
2810 __clear_page_buffers(page);
2811 return 1;
2812 failed:
2813 return 0;
2816 int try_to_free_buffers(struct page *page)
2818 struct address_space * const mapping = page->mapping;
2819 struct buffer_head *buffers_to_free = NULL;
2820 int ret = 0;
2822 BUG_ON(!PageLocked(page));
2823 if (PageWriteback(page))
2824 return 0;
2826 if (mapping == NULL) { /* swapped-in anon page */
2827 ret = drop_buffers(page, &buffers_to_free);
2828 goto out;
2831 spin_lock(&mapping->private_lock);
2832 ret = drop_buffers(page, &buffers_to_free);
2833 if (ret && !PageSwapCache(page)) {
2835 * If the filesystem writes its buffers by hand (eg ext3)
2836 * then we can have clean buffers against a dirty page. We
2837 * clean the page here; otherwise later reattachment of buffers
2838 * could encounter a non-uptodate page, which is unresolvable.
2839 * This only applies in the rare case where try_to_free_buffers
2840 * succeeds but the page is not freed.
2842 clear_page_dirty(page);
2844 spin_unlock(&mapping->private_lock);
2845 out:
2846 if (buffers_to_free) {
2847 struct buffer_head *bh = buffers_to_free;
2849 do {
2850 struct buffer_head *next = bh->b_this_page;
2851 free_buffer_head(bh);
2852 bh = next;
2853 } while (bh != buffers_to_free);
2855 return ret;
2857 EXPORT_SYMBOL(try_to_free_buffers);
2859 int block_sync_page(struct page *page)
2861 blk_run_queues();
2862 return 0;
2866 * There are no bdflush tunables left. But distributions are
2867 * still running obsolete flush daemons, so we terminate them here.
2869 * Use of bdflush() is deprecated and will be removed in a future kernel.
2870 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2872 asmlinkage long sys_bdflush(int func, long data)
2874 static int msg_count;
2876 if (!capable(CAP_SYS_ADMIN))
2877 return -EPERM;
2879 if (msg_count < 5) {
2880 msg_count++;
2881 printk(KERN_INFO
2882 "warning: process `%s' used the obsolete bdflush"
2883 " system call\n", current->comm);
2884 printk(KERN_INFO "Fix your initscripts?\n");
2887 if (func == 1)
2888 do_exit(0);
2889 return 0;
2893 * Buffer-head allocation
2895 static kmem_cache_t *bh_cachep;
2898 * Once the number of bh's in the machine exceeds this level, we start
2899 * stripping them in writeback.
2901 static int max_buffer_heads;
2903 int buffer_heads_over_limit;
2905 struct bh_accounting {
2906 int nr; /* Number of live bh's */
2907 int ratelimit; /* Limit cacheline bouncing */
2910 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2912 static void recalc_bh_state(void)
2914 int i;
2915 int tot = 0;
2917 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2918 return;
2919 __get_cpu_var(bh_accounting).ratelimit = 0;
2920 for (i = 0; i < NR_CPUS; i++) {
2921 if (cpu_online(i))
2922 tot += per_cpu(bh_accounting, i).nr;
2924 buffer_heads_over_limit = (tot > max_buffer_heads);
2927 struct buffer_head *alloc_buffer_head(int gfp_flags)
2929 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
2930 if (ret) {
2931 preempt_disable();
2932 __get_cpu_var(bh_accounting).nr++;
2933 recalc_bh_state();
2934 preempt_enable();
2936 return ret;
2938 EXPORT_SYMBOL(alloc_buffer_head);
2940 void free_buffer_head(struct buffer_head *bh)
2942 BUG_ON(!list_empty(&bh->b_assoc_buffers));
2943 kmem_cache_free(bh_cachep, bh);
2944 preempt_disable();
2945 __get_cpu_var(bh_accounting).nr--;
2946 recalc_bh_state();
2947 preempt_enable();
2949 EXPORT_SYMBOL(free_buffer_head);
2951 static void
2952 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
2954 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2955 SLAB_CTOR_CONSTRUCTOR) {
2956 struct buffer_head * bh = (struct buffer_head *)data;
2958 memset(bh, 0, sizeof(*bh));
2959 INIT_LIST_HEAD(&bh->b_assoc_buffers);
2963 static void buffer_init_cpu(int cpu)
2965 struct bh_accounting *bha = &per_cpu(bh_accounting, cpu);
2966 struct bh_lru *bhl = &per_cpu(bh_lrus, cpu);
2968 bha->nr = 0;
2969 bha->ratelimit = 0;
2970 memset(bhl, 0, sizeof(*bhl));
2973 static int __devinit buffer_cpu_notify(struct notifier_block *self,
2974 unsigned long action, void *hcpu)
2976 long cpu = (long)hcpu;
2977 switch(action) {
2978 case CPU_UP_PREPARE:
2979 buffer_init_cpu(cpu);
2980 break;
2981 default:
2982 break;
2984 return NOTIFY_OK;
2987 static struct notifier_block __devinitdata buffer_nb = {
2988 .notifier_call = buffer_cpu_notify,
2991 void __init buffer_init(void)
2993 int i;
2994 int nrpages;
2996 bh_cachep = kmem_cache_create("buffer_head",
2997 sizeof(struct buffer_head), 0,
2998 0, init_buffer_head, NULL);
2999 for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
3000 init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
3003 * Limit the bh occupancy to 10% of ZONE_NORMAL
3005 nrpages = (nr_free_buffer_pages() * 10) / 100;
3006 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3007 buffer_cpu_notify(&buffer_nb, (unsigned long)CPU_UP_PREPARE,
3008 (void *)(long)smp_processor_id());
3009 register_cpu_notifier(&buffer_nb);