1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
 23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
53 #define NR_SIZES 7
54 static char buffersize_index[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
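/*
 * Illustrative sketch (not part of the original file): how the
 * buffersize_index[] table and BUFSIZE_INDEX() above map a block size
 * to a per-size free list slot.  Only the power-of-two sizes from 512
 * up to 32768 hit a non-negative index; anything else lands on -1.
 * Plain userspace C, with the table copied under a different name.
 */
#include <stdio.h>

static const signed char bsize_index[65] = {
	-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
	 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	 6
};

int main(void)
{
	int sizes[] = { 512, 1024, 2048, 4096, 8192, 16384, 32768, 3072 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("size %5d -> index %d\n",
		       sizes[i], bsize_index[sizes[i] >> 9]);
	return 0;	/* 3072 prints -1: not a supported buffer size */
}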
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
71 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
74 * Hash table gook..
76 static unsigned int bh_hash_mask;
77 static unsigned int bh_hash_shift;
78 static struct buffer_head **hash_table;
79 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
81 static struct buffer_head *lru_list[NR_LIST];
82 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
83 static int nr_buffers_type[NR_LIST];
84 static unsigned long size_buffers_type[NR_LIST];
86 static struct buffer_head * unused_list;
87 static int nr_unused_buffer_heads;
88 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
89 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
91 struct bh_free_head {
92 struct buffer_head *list;
93 spinlock_t lock;
95 static struct bh_free_head free_list[NR_SIZES];
97 kmem_cache_t *bh_cachep;
99 static int grow_buffers(int size);
100 static void __refile_buffer(struct buffer_head *);
102 /* This is used by some architectures to estimate available memory. */
103 atomic_t buffermem_pages = ATOMIC_INIT(0);
105 /* Here is the parameter block for the bdflush process. If you add or
106 * remove any of the parameters, make sure to update kernel/sysctl.c.
109 #define N_PARAM 9
111 /* The dummy values in this structure are left in there for compatibility
112 * with old programs that play with the /proc entries.
114 union bdflush_param {
115 struct {
116 int nfract; /* Percentage of buffer cache dirty to
117 activate bdflush */
118 int ndirty; /* Maximum number of dirty blocks to write out per
119 wake-cycle */
120 int nrefill; /* Number of clean buffers to try to obtain
121 each time we call refill */
122 int nref_dirt; /* Dirty buffer threshold for activating bdflush
123 when trying to refill buffers. */
124 int interval; /* jiffies delay between kupdate flushes */
125 int age_buffer; /* Time for normal buffer to age before we flush it */
126 int age_super; /* Time for superblock to age before we flush it */
127 int dummy2; /* unused */
128 int dummy3; /* unused */
129 } b_un;
130 unsigned int data[N_PARAM];
131 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
133 /* These are the min and max parameter values that we will allow to be assigned */
134 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
135 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
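/*
 * Illustrative sketch (not part of the original file): on kernels of
 * this vintage bdf_prm is exposed through the sysctl table mentioned
 * above (kernel/sysctl.c), readable as /proc/sys/vm/bdflush.  Treating
 * that path and its layout as an assumption, the userspace snippet
 * below parses the nine integers and prints a few of the named fields.
 */
#include <stdio.h>

int main(void)
{
	int p[9] = { 0 };
	FILE *f = fopen("/proc/sys/vm/bdflush", "r");

	if (!f) {
		perror("/proc/sys/vm/bdflush");
		return 1;
	}
	for (int i = 0; i < 9; i++)
		if (fscanf(f, "%d", &p[i]) != 1)
			break;
	fclose(f);
	printf("nfract=%d ndirty=%d nrefill=%d interval=%d jiffies age_buffer=%d jiffies\n",
	       p[0], p[1], p[2], p[4], p[5]);
	return 0;
}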
138 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 139 * getting rid of the cli-sti pairs. The wait-queue routines still
140 * need cli-sti, but now it's just a couple of 386 instructions or so.
142 * Note that the real wait_on_buffer() is an inline function that checks
143 * if 'b_wait' is set before calling this, so that the queues aren't set
144 * up unnecessarily.
146 void __wait_on_buffer(struct buffer_head * bh)
148 struct task_struct *tsk = current;
149 DECLARE_WAITQUEUE(wait, tsk);
151 atomic_inc(&bh->b_count);
152 add_wait_queue(&bh->b_wait, &wait);
153 do {
154 run_task_queue(&tq_disk);
155 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
156 if (!buffer_locked(bh))
157 break;
158 schedule();
159 } while (buffer_locked(bh));
160 tsk->state = TASK_RUNNING;
161 remove_wait_queue(&bh->b_wait, &wait);
162 atomic_dec(&bh->b_count);
165 /* Call sync_buffers with wait!=0 to ensure that the call does not
166 * return until all buffer writes have completed. Sync() may return
167 * before the writes have finished; fsync() may not.
170 /* Godamity-damn. Some buffers (bitmaps for filesystems)
171 * spontaneously dirty themselves without ever brelse being called.
172 * We will ultimately want to put these in a separate list, but for
173 * now we search all of the lists for dirty buffers.
175 static int sync_buffers(kdev_t dev, int wait)
177 int i, retry, pass = 0, err = 0;
178 struct buffer_head * bh, *next;
180 /* One pass for no-wait, three for wait:
181 * 0) write out all dirty, unlocked buffers;
182 * 1) write out all dirty buffers, waiting if locked;
183 * 2) wait for completion by waiting for all buffers to unlock.
185 do {
186 retry = 0;
188 /* We search all lists as a failsafe mechanism, not because we expect
189 * there to be dirty buffers on any of the other lists.
191 repeat:
192 spin_lock(&lru_list_lock);
193 bh = lru_list[BUF_DIRTY];
194 if (!bh)
195 goto repeat2;
197 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
198 next = bh->b_next_free;
200 if (!lru_list[BUF_DIRTY])
201 break;
202 if (dev && bh->b_dev != dev)
203 continue;
204 if (buffer_locked(bh)) {
205 /* Buffer is locked; skip it unless wait is
206 * requested AND pass > 0.
208 if (!wait || !pass) {
209 retry = 1;
210 continue;
212 atomic_inc(&bh->b_count);
213 spin_unlock(&lru_list_lock);
214 wait_on_buffer (bh);
215 atomic_dec(&bh->b_count);
216 goto repeat;
219 /* If an unlocked buffer is not uptodate, there has
220 * been an IO error. Skip it.
222 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
223 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
224 err = -EIO;
225 continue;
228 /* Don't write clean buffers. Don't write ANY buffers
229 * on the third pass.
231 if (!buffer_dirty(bh) || pass >= 2)
232 continue;
234 atomic_inc(&bh->b_count);
235 spin_unlock(&lru_list_lock);
236 ll_rw_block(WRITE, 1, &bh);
237 atomic_dec(&bh->b_count);
238 retry = 1;
239 goto repeat;
242 repeat2:
243 bh = lru_list[BUF_LOCKED];
244 if (!bh) {
245 spin_unlock(&lru_list_lock);
246 break;
248 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
249 next = bh->b_next_free;
251 if (!lru_list[BUF_LOCKED])
252 break;
253 if (dev && bh->b_dev != dev)
254 continue;
255 if (buffer_locked(bh)) {
256 /* Buffer is locked; skip it unless wait is
257 * requested AND pass > 0.
259 if (!wait || !pass) {
260 retry = 1;
261 continue;
263 atomic_inc(&bh->b_count);
264 spin_unlock(&lru_list_lock);
265 wait_on_buffer (bh);
266 spin_lock(&lru_list_lock);
267 atomic_dec(&bh->b_count);
268 goto repeat2;
271 spin_unlock(&lru_list_lock);
273 /* If we are waiting for the sync to succeed, and if any dirty
274 * blocks were written, then repeat; on the second pass, only
275 * wait for buffers being written (do not pass to write any
276 * more buffers on the second pass).
278 } while (wait && retry && ++pass<=2);
279 return err;
282 void sync_dev(kdev_t dev)
284 sync_supers(dev);
285 sync_inodes(dev);
286 DQUOT_SYNC(dev);
 287 /* sync all the dirty buffers out to disk only _after_ all the
 288 high level layers have finished generating dirty buffer data
 289 (or we'll return with some buffers still dirty on the block device,
 290 breaking the semantics of this call) */
291 sync_buffers(dev, 0);
293 * FIXME(eric) we need to sync the physical devices here.
294 * This is because some (scsi) controllers have huge amounts of
295 * cache onboard (hundreds of Mb), and we need to instruct
296 * them to commit all of the dirty memory to disk, and we should
297 * not return until this has happened.
299 * This would need to get implemented by going through the assorted
300 * layers so that each block major number can be synced, and this
301 * would call down into the upper and mid-layer scsi.
305 int fsync_dev(kdev_t dev)
307 sync_buffers(dev, 0);
309 lock_kernel();
310 sync_supers(dev);
311 sync_inodes(dev);
312 DQUOT_SYNC(dev);
313 unlock_kernel();
315 return sync_buffers(dev, 1);
318 asmlinkage long sys_sync(void)
320 fsync_dev(0);
321 return 0;
325 * filp may be NULL if called via the msync of a vma.
328 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
330 struct inode * inode = dentry->d_inode;
331 struct super_block * sb;
332 kdev_t dev;
333 int ret;
335 lock_kernel();
336 /* sync the inode to buffers */
337 write_inode_now(inode, 0);
339 /* sync the superblock to buffers */
340 sb = inode->i_sb;
341 wait_on_super(sb);
342 if (sb->s_op && sb->s_op->write_super)
343 sb->s_op->write_super(sb);
345 /* .. finally sync the buffers to disk */
346 dev = inode->i_dev;
347 ret = sync_buffers(dev, 1);
348 unlock_kernel();
349 return ret;
352 asmlinkage long sys_fsync(unsigned int fd)
354 struct file * file;
355 struct dentry * dentry;
356 struct inode * inode;
357 int err;
359 err = -EBADF;
360 file = fget(fd);
361 if (!file)
362 goto out;
364 dentry = file->f_dentry;
365 inode = dentry->d_inode;
367 err = -EINVAL;
368 if (!file->f_op || !file->f_op->fsync)
369 goto out_putf;
371 /* We need to protect against concurrent writers.. */
372 down(&inode->i_sem);
373 err = file->f_op->fsync(file, dentry, 0);
374 up(&inode->i_sem);
376 out_putf:
377 fput(file);
378 out:
379 return err;
382 asmlinkage long sys_fdatasync(unsigned int fd)
384 struct file * file;
385 struct dentry * dentry;
386 struct inode * inode;
387 int err;
389 err = -EBADF;
390 file = fget(fd);
391 if (!file)
392 goto out;
394 dentry = file->f_dentry;
395 inode = dentry->d_inode;
397 err = -EINVAL;
398 if (!file->f_op || !file->f_op->fsync)
399 goto out_putf;
401 down(&inode->i_sem);
402 err = file->f_op->fsync(file, dentry, 1);
403 up(&inode->i_sem);
405 out_putf:
406 fput(file);
407 out:
408 return err;
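/*
 * Illustrative sketch (not part of the original file): the userspace
 * counterparts of sys_sync(), sys_fsync() and sys_fdatasync() above.
 * Note that sys_fdatasync() calls ->fsync() with datasync == 1, so a
 * filesystem may skip flushing pure metadata updates for it.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("data.log", O_WRONLY | O_CREAT | O_APPEND, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "record\n", 7) != 7)
		perror("write");
	if (fdatasync(fd) < 0)		/* wait for the file's data blocks */
		perror("fdatasync");
	if (fsync(fd) < 0)		/* data plus inode metadata */
		perror("fsync");
	close(fd);
	sync();				/* kick a global flush of everything */
	return 0;
}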
411 /* After several hours of tedious analysis, the following hash
412 * function won. Do not mess with it... -DaveM
414 #define _hashfn(dev,block) \
415 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
416 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
417 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
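/*
 * Illustrative sketch (not part of the original file): the hash above
 * lifted into userspace so its spread can be eyeballed.  bh_hash_shift
 * and bh_hash_mask are normally derived from memory size at init time;
 * the 2^15-bucket table assumed below is just for the demo.
 */
#include <stdio.h>

#define DEMO_SHIFT 15
#define DEMO_MASK  ((1U << DEMO_SHIFT) - 1)

static unsigned int demo_hashfn(unsigned int dev, unsigned int block)
{
	return (((dev << (DEMO_SHIFT - 6)) ^ (dev << (DEMO_SHIFT - 9))) ^
		((block << (DEMO_SHIFT - 6)) ^ (block >> 13) ^
		 (block << (DEMO_SHIFT - 12)))) & DEMO_MASK;
}

int main(void)
{
	unsigned int dev = 0x0803;	/* e.g. major 8, minor 3 */

	for (unsigned int block = 0; block < 8; block++)
		printf("dev %#x block %u -> bucket %u\n",
		       dev, block, demo_hashfn(dev, block));
	return 0;
}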
419 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
421 if ((bh->b_next = *head) != NULL)
422 bh->b_next->b_pprev = &bh->b_next;
423 *head = bh;
424 bh->b_pprev = head;
427 static __inline__ void __hash_unlink(struct buffer_head *bh)
429 if (bh->b_pprev) {
430 if (bh->b_next)
431 bh->b_next->b_pprev = bh->b_pprev;
432 *(bh->b_pprev) = bh->b_next;
433 bh->b_pprev = NULL;
437 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
439 struct buffer_head **bhp = &lru_list[blist];
441 if(!*bhp) {
442 *bhp = bh;
443 bh->b_prev_free = bh;
445 bh->b_next_free = *bhp;
446 bh->b_prev_free = (*bhp)->b_prev_free;
447 (*bhp)->b_prev_free->b_next_free = bh;
448 (*bhp)->b_prev_free = bh;
449 nr_buffers_type[blist]++;
450 size_buffers_type[blist] += bh->b_size;
453 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
455 if (bh->b_prev_free || bh->b_next_free) {
456 bh->b_prev_free->b_next_free = bh->b_next_free;
457 bh->b_next_free->b_prev_free = bh->b_prev_free;
458 if (lru_list[blist] == bh)
459 lru_list[blist] = bh->b_next_free;
460 if (lru_list[blist] == bh)
461 lru_list[blist] = NULL;
462 bh->b_next_free = bh->b_prev_free = NULL;
463 nr_buffers_type[blist]--;
464 size_buffers_type[blist] -= bh->b_size;
468 static void __remove_from_free_list(struct buffer_head * bh, int index)
470 if(bh->b_next_free == bh)
471 free_list[index].list = NULL;
472 else {
473 bh->b_prev_free->b_next_free = bh->b_next_free;
474 bh->b_next_free->b_prev_free = bh->b_prev_free;
475 if (free_list[index].list == bh)
476 free_list[index].list = bh->b_next_free;
478 bh->b_next_free = bh->b_prev_free = NULL;
481 /* must be called with both the hash_table_lock and the lru_list_lock
482 held */
483 static void __remove_from_queues(struct buffer_head *bh)
485 __hash_unlink(bh);
486 __remove_from_lru_list(bh, bh->b_list);
489 static void insert_into_queues(struct buffer_head *bh)
491 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
493 spin_lock(&lru_list_lock);
494 write_lock(&hash_table_lock);
495 __hash_link(bh, head);
496 __insert_into_lru_list(bh, bh->b_list);
497 write_unlock(&hash_table_lock);
498 spin_unlock(&lru_list_lock);
501 /* This function must only run if there are no other
502 * references _anywhere_ to this buffer head.
504 static void put_last_free(struct buffer_head * bh)
506 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
507 struct buffer_head **bhp = &head->list;
509 bh->b_state = 0;
511 spin_lock(&head->lock);
512 bh->b_dev = B_FREE;
513 if(!*bhp) {
514 *bhp = bh;
515 bh->b_prev_free = bh;
517 bh->b_next_free = *bhp;
518 bh->b_prev_free = (*bhp)->b_prev_free;
519 (*bhp)->b_prev_free->b_next_free = bh;
520 (*bhp)->b_prev_free = bh;
521 spin_unlock(&head->lock);
525 * Why like this, I hear you say... The reason is race-conditions.
526 * As we don't lock buffers (unless we are reading them, that is),
527 * something might happen to it while we sleep (ie a read-error
528 * will force it bad). This shouldn't really happen currently, but
529 * the code is ready. */
530 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
532 struct buffer_head **head = &hash(dev, block);
533 struct buffer_head *bh;
535 read_lock(&hash_table_lock);
536 for(bh = *head; bh; bh = bh->b_next)
537 if (bh->b_blocknr == block &&
538 bh->b_size == size &&
539 bh->b_dev == dev)
540 break;
541 if (bh)
542 atomic_inc(&bh->b_count);
543 read_unlock(&hash_table_lock);
545 return bh;
548 unsigned int get_hardblocksize(kdev_t dev)
551 * Get the hard sector size for the given device. If we don't know
552 * what it is, return 0.
554 if (hardsect_size[MAJOR(dev)] != NULL) {
555 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
556 if (blksize != 0)
557 return blksize;
561 * We don't know what the hardware sector size for this device is.
562 * Return 0 indicating that we don't know.
564 return 0;
567 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
569 spin_lock(&lru_list_lock);
570 if (bh->b_inode)
571 list_del(&bh->b_inode_buffers);
572 bh->b_inode = inode;
573 list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
574 spin_unlock(&lru_list_lock);
577 /* The caller must have the lru_list lock before calling the
578 remove_inode_queue functions. */
579 static void __remove_inode_queue(struct buffer_head *bh)
581 bh->b_inode = NULL;
582 list_del(&bh->b_inode_buffers);
585 static inline void remove_inode_queue(struct buffer_head *bh)
587 if (bh->b_inode)
588 __remove_inode_queue(bh);
591 int inode_has_buffers(struct inode *inode)
593 int ret;
595 spin_lock(&lru_list_lock);
596 ret = !list_empty(&inode->i_dirty_buffers);
597 spin_unlock(&lru_list_lock);
599 return ret;
603 /* If invalidate_buffers() will trash dirty buffers, it means some kind
 604 of fs corruption is going on. Trashing dirty data always implies losing
605 information that was supposed to be just stored on the physical layer
606 by the user.
 608 Thus invalidate_buffers in general usage is not allowed to trash dirty
 609 buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
 611 NOTE: in the case where the user removes a removable-media disk even if
 612 there's still dirty data not synced to disk (due to a bug in the device driver
 613 or to an error of the user), by not destroying the dirty buffers we could
 614 generate corruption also on the next media inserted, thus a parameter is
 615 necessary to handle this case in the safest way possible (trying
 616 not to corrupt the newly inserted disk with the data belonging to
 617 the old, now corrupted disk). Also for the ramdisk the natural thing
 618 to do in order to release the ramdisk memory is to destroy dirty buffers.
 620 These are two special cases. Normal usage implies that the device driver
 621 issues a sync on the device (without waiting for I/O completion) and
 622 then an invalidate_buffers call that doesn't trash dirty buffers. */
623 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
625 int i, nlist, slept;
626 struct buffer_head * bh, * bh_next;
628 retry:
629 slept = 0;
630 spin_lock(&lru_list_lock);
631 for(nlist = 0; nlist < NR_LIST; nlist++) {
632 bh = lru_list[nlist];
633 if (!bh)
634 continue;
635 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
636 bh_next = bh->b_next_free;
637 if (bh->b_dev != dev)
638 continue;
639 if (buffer_locked(bh)) {
640 atomic_inc(&bh->b_count);
641 spin_unlock(&lru_list_lock);
642 wait_on_buffer(bh);
643 slept = 1;
644 spin_lock(&lru_list_lock);
645 atomic_dec(&bh->b_count);
648 write_lock(&hash_table_lock);
649 if (!atomic_read(&bh->b_count) &&
650 (destroy_dirty_buffers || !buffer_dirty(bh))) {
651 __remove_from_queues(bh);
652 put_last_free(bh);
654 write_unlock(&hash_table_lock);
655 if (slept)
656 goto out;
659 out:
660 spin_unlock(&lru_list_lock);
661 if (slept)
662 goto retry;
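/*
 * Illustrative sketch (not part of the original file): the "normal
 * usage" described in the comment above __invalidate_buffers(), as a
 * hypothetical removable-media driver in this tree might spell it.
 * example_media_changed() is an invented name.
 */
static void example_media_changed(kdev_t dev)
{
	/* queue the dirty buffers for the device, do not wait for I/O */
	sync_dev(dev);
	/* drop clean, unused buffers; dirty ones are deliberately kept */
	__invalidate_buffers(dev, 0);
}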
665 void set_blocksize(kdev_t dev, int size)
667 extern int *blksize_size[];
668 int i, nlist, slept;
669 struct buffer_head * bh, * bh_next;
671 if (!blksize_size[MAJOR(dev)])
672 return;
674 /* Size must be a power of two, and between 512 and PAGE_SIZE */
675 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
676 panic("Invalid blocksize passed to set_blocksize");
678 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
679 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
680 return;
682 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
683 return;
684 sync_buffers(dev, 2);
685 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
687 retry:
688 slept = 0;
689 spin_lock(&lru_list_lock);
690 for(nlist = 0; nlist < NR_LIST; nlist++) {
691 bh = lru_list[nlist];
692 if (!bh)
693 continue;
694 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
695 bh_next = bh->b_next_free;
696 if (bh->b_dev != dev || bh->b_size == size)
697 continue;
698 if (buffer_locked(bh)) {
699 atomic_inc(&bh->b_count);
700 spin_unlock(&lru_list_lock);
701 wait_on_buffer(bh);
702 slept = 1;
703 spin_lock(&lru_list_lock);
704 atomic_dec(&bh->b_count);
707 write_lock(&hash_table_lock);
708 if (!atomic_read(&bh->b_count)) {
709 if (buffer_dirty(bh))
710 printk(KERN_WARNING
711 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
712 kdevname(dev), bh->b_blocknr, bh->b_size);
713 __remove_from_queues(bh);
714 put_last_free(bh);
715 } else {
716 if (atomic_set_buffer_clean(bh))
717 __refile_buffer(bh);
718 clear_bit(BH_Uptodate, &bh->b_state);
719 printk(KERN_WARNING
720 "set_blocksize: "
721 "b_count %d, dev %s, block %lu, from %p\n",
722 atomic_read(&bh->b_count), bdevname(bh->b_dev),
723 bh->b_blocknr, __builtin_return_address(0));
725 write_unlock(&hash_table_lock);
726 if (slept)
727 goto out;
730 out:
731 spin_unlock(&lru_list_lock);
732 if (slept)
733 goto retry;
737 * We used to try various strange things. Let's not.
739 static void refill_freelist(int size)
741 if (!grow_buffers(size)) {
742 wakeup_bdflush(1);
743 current->policy |= SCHED_YIELD;
744 schedule();
748 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
750 bh->b_list = BUF_CLEAN;
751 bh->b_end_io = handler;
752 bh->b_dev_id = dev_id;
755 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
757 mark_buffer_uptodate(bh, uptodate);
758 unlock_buffer(bh);
761 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
763 mark_buffer_uptodate(bh, uptodate);
764 unlock_buffer(bh);
765 BUG();
768 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
770 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
771 unsigned long flags;
772 struct buffer_head *tmp;
773 struct page *page;
775 mark_buffer_uptodate(bh, uptodate);
777 /* This is a temporary buffer used for page I/O. */
778 page = bh->b_page;
780 if (!uptodate)
781 SetPageError(page);
784 * Be _very_ careful from here on. Bad things can happen if
785 * two buffer heads end IO at almost the same time and both
786 * decide that the page is now completely done.
788 * Async buffer_heads are here only as labels for IO, and get
789 * thrown away once the IO for this page is complete. IO is
790 * deemed complete once all buffers have been visited
791 * (b_count==0) and are now unlocked. We must make sure that
792 * only the _last_ buffer that decrements its count is the one
 793 * that unlocks the page..
795 spin_lock_irqsave(&page_uptodate_lock, flags);
796 unlock_buffer(bh);
797 atomic_dec(&bh->b_count);
798 tmp = bh->b_this_page;
799 while (tmp != bh) {
800 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
801 goto still_busy;
802 tmp = tmp->b_this_page;
805 /* OK, the async IO on this page is complete. */
806 spin_unlock_irqrestore(&page_uptodate_lock, flags);
809 * if none of the buffers had errors then we can set the
810 * page uptodate:
812 if (!PageError(page))
813 SetPageUptodate(page);
816 * Run the hooks that have to be done when a page I/O has completed.
818 if (PageTestandClearDecrAfter(page))
819 atomic_dec(&nr_async_pages);
821 UnlockPage(page);
823 return;
825 still_busy:
826 spin_unlock_irqrestore(&page_uptodate_lock, flags);
827 return;
832 * Synchronise all the inode's dirty buffers to the disk.
834 * We have conflicting pressures: we want to make sure that all
835 * initially dirty buffers get waited on, but that any subsequently
836 * dirtied buffers don't. After all, we don't want fsync to last
837 * forever if somebody is actively writing to the file.
839 * Do this in two main stages: first we copy dirty buffers to a
840 * temporary inode list, queueing the writes as we go. Then we clean
841 * up, waiting for those writes to complete.
843 * During this second stage, any subsequent updates to the file may end
844 * up refiling the buffer on the original inode's dirty list again, so
845 * there is a chance we will end up with a buffer queued for write but
846 * not yet completed on that list. So, as a final cleanup we go through
847 * the osync code to catch these locked, dirty buffers without requeuing
848 * any newly dirty buffers for write.
851 int fsync_inode_buffers(struct inode *inode)
853 struct buffer_head *bh;
854 struct inode tmp;
855 int err = 0, err2;
857 INIT_LIST_HEAD(&tmp.i_dirty_buffers);
859 spin_lock(&lru_list_lock);
861 while (!list_empty(&inode->i_dirty_buffers)) {
862 bh = BH_ENTRY(inode->i_dirty_buffers.next);
863 list_del(&bh->b_inode_buffers);
864 if (!buffer_dirty(bh) && !buffer_locked(bh))
865 bh->b_inode = NULL;
866 else {
867 bh->b_inode = &tmp;
868 list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
869 atomic_inc(&bh->b_count);
870 if (buffer_dirty(bh)) {
871 spin_unlock(&lru_list_lock);
872 ll_rw_block(WRITE, 1, &bh);
873 spin_lock(&lru_list_lock);
878 while (!list_empty(&tmp.i_dirty_buffers)) {
879 bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
880 remove_inode_queue(bh);
881 spin_unlock(&lru_list_lock);
882 wait_on_buffer(bh);
883 if (!buffer_uptodate(bh))
884 err = -EIO;
885 brelse(bh);
886 spin_lock(&lru_list_lock);
889 spin_unlock(&lru_list_lock);
890 err2 = osync_inode_buffers(inode);
892 if (err)
893 return err;
894 else
895 return err2;
900 * osync is designed to support O_SYNC io. It waits synchronously for
901 * all already-submitted IO to complete, but does not queue any new
902 * writes to the disk.
904 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
905 * you dirty the buffers, and then use osync_inode_buffers to wait for
906 * completion. Any other dirty buffers which are not yet queued for
907 * write will not be flushed to disk by the osync.
910 int osync_inode_buffers(struct inode *inode)
912 struct buffer_head *bh;
913 struct list_head *list;
914 int err = 0;
916 spin_lock(&lru_list_lock);
918 repeat:
920 for (list = inode->i_dirty_buffers.prev;
921 bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
922 list = bh->b_inode_buffers.prev) {
923 if (buffer_locked(bh)) {
924 atomic_inc(&bh->b_count);
925 spin_unlock(&lru_list_lock);
926 wait_on_buffer(bh);
927 brelse(bh);
928 if (!buffer_uptodate(bh))
929 err = -EIO;
930 spin_lock(&lru_list_lock);
931 goto repeat;
935 spin_unlock(&lru_list_lock);
936 return err;
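/*
 * Illustrative sketch (not part of the original file): the O_SYNC
 * recipe from the comment above, as a hypothetical filesystem write
 * path in this tree might apply it.  example_osync_write() and its
 * arguments are invented; real callers live in the individual
 * filesystems.
 */
static int example_osync_write(struct inode *inode, struct buffer_head *bh)
{
	/* dirty the buffer, keep it on the inode queue, queue the write now */
	mark_buffer_dirty(bh, 0);
	buffer_insert_inode_queue(bh, inode);
	ll_rw_block(WRITE, 1, &bh);

	/* ... then wait only for I/O that has already been submitted */
	return osync_inode_buffers(inode);
}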
941 * Invalidate any and all dirty buffers on a given inode. We are
942 * probably unmounting the fs, but that doesn't mean we have already
943 * done a sync(). Just drop the buffers from the inode list.
946 void invalidate_inode_buffers(struct inode *inode)
948 struct list_head *list, *next;
950 spin_lock(&lru_list_lock);
951 list = inode->i_dirty_buffers.next;
952 while (list != &inode->i_dirty_buffers) {
953 next = list->next;
954 remove_inode_queue(BH_ENTRY(list));
955 list = next;
957 spin_unlock(&lru_list_lock);
962 * Ok, this is getblk, and it isn't very clear, again to hinder
963 * race-conditions. Most of the code is seldom used, (ie repeating),
964 * so it should be much more efficient than it looks.
966 * The algorithm is changed: hopefully better, and an elusive bug removed.
968 * 14.02.92: changed it to sync dirty buffers a bit: better performance
969 * when the filesystem starts to get full of dirty blocks (I hope).
971 struct buffer_head * getblk(kdev_t dev, int block, int size)
973 struct buffer_head * bh;
974 int isize;
976 repeat:
977 bh = get_hash_table(dev, block, size);
978 if (bh)
979 goto out;
981 isize = BUFSIZE_INDEX(size);
982 spin_lock(&free_list[isize].lock);
983 bh = free_list[isize].list;
984 if (bh) {
985 __remove_from_free_list(bh, isize);
986 atomic_set(&bh->b_count, 1);
988 spin_unlock(&free_list[isize].lock);
991 * OK, FINALLY we know that this buffer is the only one of
992 * its kind, we hold a reference (b_count>0), it is unlocked,
993 * and it is clean.
995 if (bh) {
996 init_buffer(bh, end_buffer_io_sync, NULL);
997 bh->b_dev = dev;
998 bh->b_blocknr = block;
999 bh->b_state = 1 << BH_Mapped;
1001 /* Insert the buffer into the regular lists */
1002 insert_into_queues(bh);
1003 out:
1004 touch_buffer(bh);
1005 return bh;
1009 * If we block while refilling the free list, somebody may
1010 * create the buffer first ... search the hashes again.
1012 refill_freelist(size);
1013 goto repeat;
1016 /* -1 -> no need to flush
1017 0 -> async flush
 1018 1 -> sync flush (wait for I/O completion) */
1019 static int balance_dirty_state(kdev_t dev)
1021 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1023 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1024 tot = nr_free_buffer_pages();
1025 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
1027 dirty *= 200;
1028 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1029 hard_dirty_limit = soft_dirty_limit * 2;
1031 if (dirty > soft_dirty_limit) {
1032 if (dirty > hard_dirty_limit)
1033 return 1;
1034 return 0;
1036 return -1;
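/*
 * Illustrative worked example (not part of the original file): with
 * the default nfract of 40, "dirty * 200 > tot * nfract" fires once
 * dirty buffers exceed nfract/200 = 20% of the usable buffer pages,
 * and the hard (synchronous) limit is twice that, 40%.  Userspace
 * arithmetic only; the page counts are assumed values.
 */
#include <stdio.h>

int main(void)
{
	unsigned long tot = 10000;	/* assumed usable buffer pages */
	int nfract = 40;		/* default from bdf_prm above  */

	for (unsigned long dirty = 1500; dirty <= 4500; dirty += 1500) {
		int state = -1;
		if (dirty * 200 > tot * nfract)
			state = (dirty * 200 > tot * nfract * 2) ? 1 : 0;
		printf("dirty=%lu of %lu pages -> %s\n", dirty, tot,
		       state < 0 ? "no flush" :
		       state == 0 ? "async bdflush wakeup" : "sync flush");
	}
	return 0;
}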
1040 * if a new dirty buffer is created we need to balance bdflush.
1042 * in the future we might want to make bdflush aware of different
1043 * pressures on different devices - thus the (currently unused)
1044 * 'dev' parameter.
1046 void balance_dirty(kdev_t dev)
1048 int state = balance_dirty_state(dev);
1050 if (state < 0)
1051 return;
1052 wakeup_bdflush(state);
1055 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
1057 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
1058 refile_buffer(bh);
1061 /* atomic version, the user must call balance_dirty() by hand
 1062 as soon as it becomes possible to block */
1063 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
1065 if (!atomic_set_buffer_dirty(bh))
1066 __mark_dirty(bh, flag);
1069 void mark_buffer_dirty(struct buffer_head *bh, int flag)
1071 __mark_buffer_dirty(bh, flag);
1072 balance_dirty(bh->b_dev);
1076 * A buffer may need to be moved from one buffer list to another
1077 * (e.g. in case it is not shared any more). Handle this.
1079 static void __refile_buffer(struct buffer_head *bh)
1081 int dispose = BUF_CLEAN;
1082 if (buffer_locked(bh))
1083 dispose = BUF_LOCKED;
1084 if (buffer_dirty(bh))
1085 dispose = BUF_DIRTY;
1086 if (buffer_protected(bh))
1087 dispose = BUF_PROTECTED;
1088 if (dispose != bh->b_list) {
1089 __remove_from_lru_list(bh, bh->b_list);
1090 bh->b_list = dispose;
1091 __insert_into_lru_list(bh, dispose);
1092 if (dispose == BUF_CLEAN)
1093 remove_inode_queue(bh);
1097 void refile_buffer(struct buffer_head *bh)
1099 spin_lock(&lru_list_lock);
1100 __refile_buffer(bh);
1101 spin_unlock(&lru_list_lock);
1105 * Release a buffer head
1107 void __brelse(struct buffer_head * buf)
1109 if (atomic_read(&buf->b_count)) {
1110 atomic_dec(&buf->b_count);
1111 return;
1113 printk("VFS: brelse: Trying to free free buffer\n");
1117 * bforget() is like brelse(), except it puts the buffer on the
1118 * free list if it can.. We can NOT free the buffer if:
1119 * - there are other users of it
1120 * - it is locked and thus can have active IO
1122 void __bforget(struct buffer_head * buf)
1124 /* grab the lru lock here to block bdflush. */
1125 spin_lock(&lru_list_lock);
1126 write_lock(&hash_table_lock);
1127 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
1128 goto in_use;
1129 __hash_unlink(buf);
1130 remove_inode_queue(buf);
1131 write_unlock(&hash_table_lock);
1132 __remove_from_lru_list(buf, buf->b_list);
1133 spin_unlock(&lru_list_lock);
1134 put_last_free(buf);
1135 return;
1137 in_use:
1138 write_unlock(&hash_table_lock);
1139 spin_unlock(&lru_list_lock);
1143 * bread() reads a specified block and returns the buffer that contains
1144 * it. It returns NULL if the block was unreadable.
1146 struct buffer_head * bread(kdev_t dev, int block, int size)
1148 struct buffer_head * bh;
1150 bh = getblk(dev, block, size);
1151 if (buffer_uptodate(bh))
1152 return bh;
1153 ll_rw_block(READ, 1, &bh);
1154 wait_on_buffer(bh);
1155 if (buffer_uptodate(bh))
1156 return bh;
1157 brelse(bh);
1158 return NULL;
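/*
 * Illustrative sketch (not part of the original file): the classic
 * bread()/brelse() pairing, as a filesystem in this tree might use it
 * to read one metadata block.  example_read_super_block() and the
 * block/size constants are invented for the demo.
 */
static int example_read_super_block(kdev_t dev)
{
	struct buffer_head *bh = bread(dev, 1, 1024);

	if (!bh)
		return -EIO;		/* block was unreadable */
	/* ... parse bh->b_data here ... */
	brelse(bh);			/* drop the reference taken by getblk() */
	return 0;
}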
1162 * Ok, breada can be used as bread, but additionally to mark other
1163 * blocks for reading as well. End the argument list with a negative
1164 * number.
1167 #define NBUF 16
1169 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1170 unsigned int pos, unsigned int filesize)
1172 struct buffer_head * bhlist[NBUF];
1173 unsigned int blocks;
1174 struct buffer_head * bh;
1175 int index;
1176 int i, j;
1178 if (pos >= filesize)
1179 return NULL;
1181 if (block < 0)
1182 return NULL;
1184 bh = getblk(dev, block, bufsize);
1185 index = BUFSIZE_INDEX(bh->b_size);
1187 if (buffer_uptodate(bh))
1188 return(bh);
1189 else ll_rw_block(READ, 1, &bh);
1191 blocks = (filesize - pos) >> (9+index);
1193 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1194 blocks = read_ahead[MAJOR(dev)] >> index;
1195 if (blocks > NBUF)
1196 blocks = NBUF;
1198 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1200 bhlist[0] = bh;
1201 j = 1;
1202 for(i=1; i<blocks; i++) {
1203 bh = getblk(dev,block+i,bufsize);
1204 if (buffer_uptodate(bh)) {
1205 brelse(bh);
1206 break;
1208 else bhlist[j++] = bh;
1211 /* Request the read for these buffers, and then release them. */
1212 if (j>1)
1213 ll_rw_block(READA, (j-1), bhlist+1);
1214 for(i=1; i<j; i++)
1215 brelse(bhlist[i]);
1217 /* Wait for this buffer, and then continue on. */
1218 bh = bhlist[0];
1219 wait_on_buffer(bh);
1220 if (buffer_uptodate(bh))
1221 return bh;
1222 brelse(bh);
1223 return NULL;
1227 * Note: the caller should wake up the buffer_wait list if needed.
1229 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1231 if (bh->b_inode)
1232 BUG();
1233 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1234 kmem_cache_free(bh_cachep, bh);
1235 } else {
1236 bh->b_blocknr = -1;
1237 init_waitqueue_head(&bh->b_wait);
1238 nr_unused_buffer_heads++;
1239 bh->b_next_free = unused_list;
1240 bh->b_this_page = NULL;
1241 unused_list = bh;
1246 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1247 * no-buffer-head deadlock. Return NULL on failure; waiting for
1248 * buffer heads is now handled in create_buffers().
1250 static struct buffer_head * get_unused_buffer_head(int async)
1252 struct buffer_head * bh;
1254 spin_lock(&unused_list_lock);
1255 if (nr_unused_buffer_heads > NR_RESERVED) {
1256 bh = unused_list;
1257 unused_list = bh->b_next_free;
1258 nr_unused_buffer_heads--;
1259 spin_unlock(&unused_list_lock);
1260 return bh;
1262 spin_unlock(&unused_list_lock);
1264 /* This is critical. We can't swap out pages to get
1265 * more buffer heads, because the swap-out may need
1266 * more buffer-heads itself. Thus SLAB_BUFFER.
1268 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1269 memset(bh, 0, sizeof(*bh));
1270 init_waitqueue_head(&bh->b_wait);
1271 return bh;
1275 * If we need an async buffer, use the reserved buffer heads.
1277 if (async) {
1278 spin_lock(&unused_list_lock);
1279 if (unused_list) {
1280 bh = unused_list;
1281 unused_list = bh->b_next_free;
1282 nr_unused_buffer_heads--;
1283 spin_unlock(&unused_list_lock);
1284 return bh;
1286 spin_unlock(&unused_list_lock);
1288 #if 0
1290 * (Pending further analysis ...)
1291 * Ordinary (non-async) requests can use a different memory priority
1292 * to free up pages. Any swapping thus generated will use async
1293 * buffer heads.
1295 if(!async &&
1296 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1297 memset(bh, 0, sizeof(*bh));
1298 init_waitqueue_head(&bh->b_wait);
1299 return bh;
1301 #endif
1303 return NULL;
1306 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1308 bh->b_page = page;
1309 if (offset >= PAGE_SIZE)
1310 BUG();
1311 if (PageHighMem(page))
1313 * This catches illegal uses and preserves the offset:
1315 bh->b_data = (char *)(0 + offset);
1316 else
1317 bh->b_data = (char *)(page_address(page) + offset);
1321 * Create the appropriate buffers when given a page for data area and
1322 * the size of each buffer.. Use the bh->b_this_page linked list to
1323 * follow the buffers created. Return NULL if unable to create more
1324 * buffers.
1325 * The async flag is used to differentiate async IO (paging, swapping)
1326 * from ordinary buffer allocations, and only async requests are allowed
1327 * to sleep waiting for buffer heads.
1329 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1331 struct buffer_head *bh, *head;
1332 long offset;
1334 try_again:
1335 head = NULL;
1336 offset = PAGE_SIZE;
1337 while ((offset -= size) >= 0) {
1338 bh = get_unused_buffer_head(async);
1339 if (!bh)
1340 goto no_grow;
1342 bh->b_dev = B_FREE; /* Flag as unused */
1343 bh->b_this_page = head;
1344 head = bh;
1346 bh->b_state = 0;
1347 bh->b_next_free = NULL;
1348 bh->b_pprev = NULL;
1349 atomic_set(&bh->b_count, 0);
1350 bh->b_size = size;
1352 set_bh_page(bh, page, offset);
1354 bh->b_list = BUF_CLEAN;
1355 bh->b_end_io = end_buffer_io_bad;
1357 return head;
1359 * In case anything failed, we just free everything we got.
1361 no_grow:
1362 if (head) {
1363 spin_lock(&unused_list_lock);
1364 do {
1365 bh = head;
1366 head = head->b_this_page;
1367 __put_unused_buffer_head(bh);
1368 } while (head);
1369 spin_unlock(&unused_list_lock);
1371 /* Wake up any waiters ... */
1372 wake_up(&buffer_wait);
1376 * Return failure for non-async IO requests. Async IO requests
1377 * are not allowed to fail, so we have to wait until buffer heads
1378 * become available. But we don't want tasks sleeping with
1379 * partially complete buffers, so all were released above.
1381 if (!async)
1382 return NULL;
1384 /* We're _really_ low on memory. Now we just
1385 * wait for old buffer heads to become free due to
1386 * finishing IO. Since this is an async request and
1387 * the reserve list is empty, we're sure there are
1388 * async buffer heads in use.
1390 run_task_queue(&tq_disk);
1393 * Set our state for sleeping, then check again for buffer heads.
1394 * This ensures we won't miss a wake_up from an interrupt.
1396 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1397 goto try_again;
1400 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1402 struct buffer_head *head, *bh, *tail;
1403 int block;
1405 if (!PageLocked(page))
1406 BUG();
1408 * Allocate async buffer heads pointing to this page, just for I/O.
1409 * They don't show up in the buffer hash table, but they *are*
1410 * registered in page->buffers.
1412 head = create_buffers(page, size, 1);
1413 if (page->buffers)
1414 BUG();
1415 if (!head)
1416 BUG();
1417 tail = head;
1418 for (bh = head; bh; bh = bh->b_this_page) {
1419 block = *(b++);
1421 tail = bh;
1422 init_buffer(bh, end_buffer_io_async, NULL);
1423 bh->b_dev = dev;
1424 bh->b_blocknr = block;
1426 set_bit(BH_Mapped, &bh->b_state);
1428 tail->b_this_page = head;
1429 page_cache_get(page);
1430 page->buffers = head;
1431 return 0;
1434 static void unmap_buffer(struct buffer_head * bh)
1436 if (buffer_mapped(bh)) {
1437 mark_buffer_clean(bh);
1438 wait_on_buffer(bh);
1439 clear_bit(BH_Uptodate, &bh->b_state);
1440 clear_bit(BH_Mapped, &bh->b_state);
1441 clear_bit(BH_Req, &bh->b_state);
1442 clear_bit(BH_New, &bh->b_state);
1447 * discard_buffer - discard that buffer without doing any IO
1448 * @bh: buffer to discard
 1450 * This function removes a buffer from all the queues without doing
 1451 * any IO; we are not interested in the contents of the buffer. This
 1452 * function can block if the buffer is locked.
1454 static inline struct buffer_head *discard_buffer(struct buffer_head * bh)
1456 struct buffer_head *next;
1458 if (bh->b_dev == B_FREE)
1459 BUG();
1461 next = bh->b_this_page;
1463 unmap_buffer(bh);
1465 spin_lock(&lru_list_lock);
1466 write_lock(&hash_table_lock);
1467 spin_lock(&unused_list_lock);
1469 if (atomic_read(&bh->b_count))
1470 BUG();
1472 __hash_unlink(bh);
1473 write_unlock(&hash_table_lock);
1475 remove_inode_queue(bh);
1476 __remove_from_lru_list(bh, bh->b_list);
1477 spin_unlock(&lru_list_lock);
1479 __put_unused_buffer_head(bh);
1480 spin_unlock(&unused_list_lock);
1482 return next;
1487 * We don't have to release all buffers here, but
1488 * we have to be sure that no dirty buffer is left
1489 * and no IO is going on (no buffer is locked), because
1490 * we have truncated the file and are going to free the
1491 * blocks on-disk..
1493 int block_flushpage(struct page *page, unsigned long offset)
1495 struct buffer_head *head, *bh, *next;
1496 unsigned int curr_off = 0;
1498 if (!PageLocked(page))
1499 BUG();
1500 if (!page->buffers)
1501 return 1;
1503 head = page->buffers;
1504 bh = head;
1505 do {
1506 unsigned int next_off = curr_off + bh->b_size;
1507 next = bh->b_this_page;
1510 * is this block fully flushed?
1512 if (offset <= curr_off)
1513 unmap_buffer(bh);
1514 curr_off = next_off;
1515 bh = next;
1516 } while (bh != head);
1518 return 1;
1522 * block_destroy_buffers - Will destroy the contents of all the
1523 * buffers in this page
1524 * @page: page to examine the buffers
 1526 * This function destroys all the buffers in one page without doing
 1527 * any IO. The function can block because discard_buffer
 1528 * can block.
1530 void block_destroy_buffers(struct page *page)
1532 struct buffer_head *bh, *head;
1534 if (!PageLocked(page))
1535 BUG();
1536 if (!page->buffers)
1537 return;
1539 head = page->buffers;
1540 bh = head;
1541 do {
1542 /* We need to get the next buffer from discard buffer
1543 * because discard buffer can block and anybody else
1544 * can change the buffer list under our feet.
1546 bh = discard_buffer(bh);
1547 }while (bh != head);
1549 /* Wake up anyone waiting for buffer heads */
1550 wake_up(&buffer_wait);
1552 /* And free the page */
1553 page->buffers = NULL;
1554 page_cache_release(page);
1557 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1559 struct buffer_head *bh, *head, *tail;
1561 head = create_buffers(page, blocksize, 1);
1562 if (page->buffers)
1563 BUG();
1565 bh = head;
1566 do {
1567 bh->b_dev = inode->i_dev;
1568 bh->b_blocknr = 0;
1569 bh->b_end_io = end_buffer_io_bad;
1570 tail = bh;
1571 bh = bh->b_this_page;
1572 } while (bh);
1573 tail->b_this_page = head;
1574 page->buffers = head;
1575 page_cache_get(page);
1578 static void unmap_underlying_metadata(struct buffer_head * bh)
1580 struct buffer_head *old_bh;
1582 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1583 if (old_bh) {
1584 unmap_buffer(old_bh);
1585 /* Here we could run brelse or bforget. We use
1586 bforget because it will try to put the buffer
1587 in the freelist. */
1588 __bforget(old_bh);
1593 * block_write_full_page() is SMP-safe - currently it's still
1594 * being called with the kernel lock held, but the code is ready.
1596 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1598 int err, i, need_balance_dirty = 0;
1599 unsigned long block;
1600 struct buffer_head *bh, *head;
1602 if (!PageLocked(page))
1603 BUG();
1605 if (!page->buffers)
1606 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1607 head = page->buffers;
1609 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1611 bh = head;
1612 i = 0;
1613 do {
1615 * If the buffer isn't up-to-date, we can't be sure
1616 * that the buffer has been initialized with the proper
1617 * block number information etc..
1619 * Leave it to the low-level FS to make all those
1620 * decisions (block #0 may actually be a valid block)
1622 bh->b_end_io = end_buffer_io_sync;
1623 if (!buffer_mapped(bh)) {
1624 err = get_block(inode, block, bh, 1);
1625 if (err)
1626 goto out;
1627 if (buffer_new(bh))
1628 unmap_underlying_metadata(bh);
1630 set_bit(BH_Uptodate, &bh->b_state);
1631 if (!atomic_set_buffer_dirty(bh)) {
1632 __mark_dirty(bh, 0);
1633 need_balance_dirty = 1;
1636 bh = bh->b_this_page;
1637 block++;
1638 } while (bh != head);
1640 if (need_balance_dirty)
1641 balance_dirty(bh->b_dev);
1643 SetPageUptodate(page);
1644 return 0;
1645 out:
1646 ClearPageUptodate(page);
1647 return err;
1650 static int __block_prepare_write(struct inode *inode, struct page *page,
1651 unsigned from, unsigned to, get_block_t *get_block)
1653 unsigned block_start, block_end;
1654 unsigned long block;
1655 int err = 0;
1656 unsigned blocksize, bbits;
1657 struct buffer_head *bh, *head, *wait[MAX_BUF_PER_PAGE], **wait_bh=wait;
1658 char *kaddr = (char *)kmap(page);
1660 blocksize = inode->i_sb->s_blocksize;
1661 if (!page->buffers)
1662 create_empty_buffers(page, inode, blocksize);
1663 head = page->buffers;
1665 bbits = inode->i_sb->s_blocksize_bits;
1666 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1668 for(bh = head, block_start = 0; bh != head || !block_start;
1669 block++, block_start=block_end, bh = bh->b_this_page) {
1670 if (!bh)
1671 BUG();
1672 block_end = block_start+blocksize;
1673 if (block_end <= from)
1674 continue;
1675 if (block_start >= to)
1676 break;
1677 bh->b_end_io = end_buffer_io_sync;
1678 if (!buffer_mapped(bh)) {
1679 err = get_block(inode, block, bh, 1);
1680 if (err)
1681 goto out;
1682 if (buffer_new(bh)) {
1683 unmap_underlying_metadata(bh);
1684 if (block_end > to)
1685 memset(kaddr+to, 0, block_end-to);
1686 if (block_start < from)
1687 memset(kaddr+block_start, 0, from-block_start);
1688 continue;
1691 if (!buffer_uptodate(bh) &&
1692 (block_start < from || block_end > to)) {
1693 ll_rw_block(READ, 1, &bh);
1694 *wait_bh++=bh;
1698 * If we issued read requests - let them complete.
1700 while(wait_bh > wait) {
1701 wait_on_buffer(*--wait_bh);
1702 err = -EIO;
1703 if (!buffer_uptodate(*wait_bh))
1704 goto out;
1706 return 0;
1707 out:
1708 return err;
1711 static int __block_commit_write(struct inode *inode, struct page *page,
1712 unsigned from, unsigned to)
1714 unsigned block_start, block_end;
1715 int partial = 0, need_balance_dirty = 0;
1716 unsigned blocksize;
1717 struct buffer_head *bh, *head;
1719 blocksize = inode->i_sb->s_blocksize;
1721 for(bh = head = page->buffers, block_start = 0;
1722 bh != head || !block_start;
1723 block_start=block_end, bh = bh->b_this_page) {
1724 block_end = block_start + blocksize;
1725 if (block_end <= from || block_start >= to) {
1726 if (!buffer_uptodate(bh))
1727 partial = 1;
1728 } else {
1729 set_bit(BH_Uptodate, &bh->b_state);
1730 if (!atomic_set_buffer_dirty(bh)) {
1731 buffer_insert_inode_queue(bh, inode);
1732 __mark_dirty(bh, 0);
1733 need_balance_dirty = 1;
1738 if (need_balance_dirty)
1739 balance_dirty(bh->b_dev);
 1741 * If this is a partial write that happened to make all buffers
 1742 * uptodate then we can optimize away a bogus readpage() for
 1743 * the next read(). Here we 'discover' whether the page went
 1744 * uptodate as a result of this (potentially partial) write.
1746 if (!partial)
1747 SetPageUptodate(page);
1748 return 0;
1752 * Generic "read page" function for block devices that have the normal
1753 * get_block functionality. This is most of the block device filesystems.
1754 * Reads the page asynchronously --- the unlock_buffer() and
1755 * mark_buffer_uptodate() functions propagate buffer state into the
1756 * page struct once IO has completed.
1758 int block_read_full_page(struct page *page, get_block_t *get_block)
1760 struct inode *inode = (struct inode*)page->mapping->host;
1761 unsigned long iblock, lblock;
1762 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1763 unsigned int blocksize, blocks;
1764 unsigned long kaddr = 0;
1765 int nr, i;
1767 if (!PageLocked(page))
1768 PAGE_BUG(page);
1769 blocksize = inode->i_sb->s_blocksize;
1770 if (!page->buffers)
1771 create_empty_buffers(page, inode, blocksize);
1772 head = page->buffers;
1774 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1775 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1776 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1777 bh = head;
1778 nr = 0;
1779 i = 0;
1781 do {
1782 if (buffer_uptodate(bh))
1783 continue;
1785 if (!buffer_mapped(bh)) {
1786 if (iblock < lblock)
1787 get_block(inode, iblock, bh, 0);
1788 if (!buffer_mapped(bh)) {
1789 if (!kaddr)
1790 kaddr = kmap(page);
1791 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1792 set_bit(BH_Uptodate, &bh->b_state);
1793 continue;
1797 init_buffer(bh, end_buffer_io_async, NULL);
1798 atomic_inc(&bh->b_count);
1799 arr[nr] = bh;
1800 nr++;
1801 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1803 if (nr) {
1804 if (Page_Uptodate(page))
1805 BUG();
1806 ll_rw_block(READ, nr, arr);
1807 } else {
1809 * all buffers are uptodate - we can set the page
1810 * uptodate as well.
1812 SetPageUptodate(page);
1813 UnlockPage(page);
1815 if (kaddr)
1816 kunmap(page);
1817 return 0;
 1821 * For moronic filesystems that do not allow holes in files.
1822 * We may have to extend the file.
1825 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1827 struct address_space *mapping = page->mapping;
1828 struct inode *inode = (struct inode*)mapping->host;
1829 struct page *new_page;
1830 unsigned long pgpos;
1831 long status;
1832 unsigned zerofrom;
1833 unsigned blocksize = inode->i_sb->s_blocksize;
1834 char *kaddr;
1836 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1837 status = -ENOMEM;
1838 new_page = grab_cache_page(mapping, pgpos);
1839 if (!new_page)
1840 goto out;
1841 /* we might sleep */
1842 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1843 UnlockPage(new_page);
1844 page_cache_release(new_page);
1845 continue;
1847 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1848 if (zerofrom & (blocksize-1)) {
1849 *bytes |= (blocksize-1);
1850 (*bytes)++;
1852 status = __block_prepare_write(inode, new_page, zerofrom,
1853 PAGE_CACHE_SIZE, get_block);
1854 if (status)
1855 goto out_unmap;
 1856 kaddr = (char*)page_address(new_page);
 1857 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
 1858 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1859 kunmap(new_page);
1860 UnlockPage(new_page);
1861 page_cache_release(new_page);
1864 if (page->index < pgpos) {
1865 /* completely inside the area */
1866 zerofrom = offset;
1867 } else {
1868 /* page covers the boundary, find the boundary offset */
1869 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1871 /* if we will expand the thing last block will be filled */
1872 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1873 *bytes |= (blocksize-1);
1874 (*bytes)++;
1877 /* starting below the boundary? Nothing to zero out */
1878 if (offset <= zerofrom)
1879 zerofrom = offset;
1881 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1882 if (status)
1883 goto out1;
1884 kaddr = (char*)page_address(page);
1885 if (zerofrom < offset) {
1886 memset(kaddr+zerofrom, 0, offset-zerofrom);
1887 __block_commit_write(inode, page, zerofrom, offset);
1889 return 0;
1890 out1:
1891 ClearPageUptodate(page);
1892 kunmap(page);
1893 return status;
1895 out_unmap:
1896 ClearPageUptodate(new_page);
1897 kunmap(new_page);
1898 UnlockPage(new_page);
1899 page_cache_release(new_page);
1900 out:
1901 return status;
1904 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1905 get_block_t *get_block)
1907 struct inode *inode = (struct inode*)page->mapping->host;
1908 int err = __block_prepare_write(inode, page, from, to, get_block);
1909 if (err) {
1910 ClearPageUptodate(page);
1911 kunmap(page);
1913 return err;
1916 int generic_commit_write(struct file *file, struct page *page,
1917 unsigned from, unsigned to)
1919 struct inode *inode = (struct inode*)page->mapping->host;
1920 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1921 __block_commit_write(inode,page,from,to);
1922 kunmap(page);
1923 if (pos > inode->i_size)
1924 inode->i_size = pos;
1925 return 0;
1928 int block_write_full_page(struct page *page, get_block_t *get_block)
1930 struct inode *inode = (struct inode*)page->mapping->host;
1931 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1932 unsigned offset;
1933 int err;
1935 /* easy case */
1936 if (page->index < end_index)
1937 return __block_write_full_page(inode, page, get_block);
1939 /* things got complicated... */
1940 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1941 /* OK, are we completely out? */
1942 if (page->index >= end_index+1 || !offset)
1943 return -EIO;
1944 /* Sigh... will have to work, then... */
1945 err = __block_prepare_write(inode, page, 0, offset, get_block);
1946 if (!err) {
1947 memset((char *)page_address(page)+offset, 0, PAGE_CACHE_SIZE-offset);
1948 __block_commit_write(inode,page,0,offset);
1949 done:
1950 kunmap(page);
1951 return err;
1953 ClearPageUptodate(page);
1954 goto done;
1957 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1959 struct buffer_head tmp;
1960 struct inode *inode = (struct inode*)mapping->host;
1961 tmp.b_state = 0;
1962 tmp.b_blocknr = 0;
1963 get_block(inode, block, &tmp, 0);
1964 return tmp.b_blocknr;
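/*
 * Illustrative sketch (not part of the original file): how a simple
 * block filesystem in this tree could wire the generic helpers above
 * into its address_space_operations.  All "example_" names and the toy
 * block mapping are invented; writepage is left out because its
 * prototype varied across trees of this era, so treat the struct
 * layout as an assumption.
 */
static int example_get_block(struct inode *inode, long block,
			     struct buffer_head *bh, int create)
{
	/* toy mapping: file block N lives at device block N + 100;
	   "create" is ignored because nothing is ever allocated */
	bh->b_dev = inode->i_dev;
	bh->b_blocknr = block + 100;
	bh->b_state |= (1UL << BH_Mapped);
	return 0;
}

static int example_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, example_get_block);
}

static int example_prepare_write(struct file *file, struct page *page,
				 unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, example_get_block);
}

static int example_bmap(struct address_space *mapping, long block)
{
	return generic_block_bmap(mapping, block, example_get_block);
}

static struct address_space_operations example_aops = {
	readpage:	example_readpage,
	prepare_write:	example_prepare_write,
	commit_write:	generic_commit_write,
	bmap:		example_bmap,
};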
1968 * IO completion routine for a buffer_head being used for kiobuf IO: we
1969 * can't dispatch the kiobuf callback until io_count reaches 0.
1972 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1974 struct kiobuf *kiobuf;
1976 mark_buffer_uptodate(bh, uptodate);
1978 kiobuf = bh->b_kiobuf;
1979 unlock_buffer(bh);
1980 end_kio_request(kiobuf, uptodate);
1985 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1986 * for them to complete. Clean up the buffer_heads afterwards.
1989 static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
1991 int iosize;
1992 int i;
1993 struct buffer_head *tmp;
1995 struct task_struct *tsk = current;
1996 DECLARE_WAITQUEUE(wait, tsk);
1998 if (rw == WRITE)
1999 rw = WRITERAW;
2000 ll_rw_block(rw, nr, bh);
2002 iosize = 0;
2003 spin_lock(&unused_list_lock);
2005 for (i = nr; --i >= 0; ) {
2006 iosize += size;
2007 tmp = bh[i];
2008 if (buffer_locked(tmp)) {
2009 spin_unlock(&unused_list_lock);
2010 wait_on_buffer(tmp);
2011 spin_lock(&unused_list_lock);
2014 if (!buffer_uptodate(tmp)) {
2015 /* We are traversing bh'es in reverse order so
2016 clearing iosize on error calculates the
2017 amount of IO before the first error. */
2018 iosize = 0;
2020 __put_unused_buffer_head(tmp);
2023 spin_unlock(&unused_list_lock);
2024 wake_up(&buffer_wait);
2026 return iosize;
2030 * Start I/O on a physical range of kernel memory, defined by a vector
2031 * of kiobuf structs (much like a user-space iovec list).
2033 * The kiobuf must already be locked for IO. IO is submitted
2034 * asynchronously: you need to check page->locked, page->uptodate, and
2035 * maybe wait on page->wait.
2037 * It is up to the caller to make sure that there are enough blocks
2038 * passed in to completely map the iobufs to disk.
2041 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2042 kdev_t dev, unsigned long b[], int size)
2044 int err;
2045 int length;
2046 int transferred;
2047 int i;
2048 int bufind;
2049 int pageind;
2050 int bhind;
2051 int offset;
2052 unsigned long blocknr;
2053 struct kiobuf * iobuf = NULL;
2054 struct page * map;
2055 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
2057 if (!nr)
2058 return 0;
2061 * First, do some alignment and validity checks
2063 for (i = 0; i < nr; i++) {
2064 iobuf = iovec[i];
2065 if ((iobuf->offset & (size-1)) ||
2066 (iobuf->length & (size-1)))
2067 return -EINVAL;
2068 if (!iobuf->nr_pages)
2069 panic("brw_kiovec: iobuf not initialised");
2073 * OK to walk down the iovec doing page IO on each page we find.
2075 bufind = bhind = transferred = err = 0;
2076 for (i = 0; i < nr; i++) {
2077 iobuf = iovec[i];
2078 offset = iobuf->offset;
2079 length = iobuf->length;
2080 iobuf->errno = 0;
2082 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2083 map = iobuf->maplist[pageind];
2084 if (!map) {
2085 err = -EFAULT;
2086 goto error;
2089 while (length > 0) {
2090 blocknr = b[bufind++];
2091 tmp = get_unused_buffer_head(0);
2092 if (!tmp) {
2093 err = -ENOMEM;
2094 goto error;
2097 tmp->b_dev = B_FREE;
2098 tmp->b_size = size;
2099 set_bh_page(tmp, map, offset);
2100 tmp->b_this_page = tmp;
2102 init_buffer(tmp, end_buffer_io_kiobuf, NULL);
2103 tmp->b_dev = dev;
2104 tmp->b_blocknr = blocknr;
2105 tmp->b_state = 1 << BH_Mapped;
2106 tmp->b_kiobuf = iobuf;
2108 if (rw == WRITE) {
2109 set_bit(BH_Uptodate, &tmp->b_state);
2110 set_bit(BH_Dirty, &tmp->b_state);
2113 bh[bhind++] = tmp;
2114 length -= size;
2115 offset += size;
2117 atomic_inc(&iobuf->io_count);
2120 * Start the IO if we have got too much
2122 if (bhind >= KIO_MAX_SECTORS) {
2123 err = do_kio(rw, bhind, bh, size);
2124 if (err >= 0)
2125 transferred += err;
2126 else
2127 goto finished;
2128 bhind = 0;
2131 if (offset >= PAGE_SIZE) {
2132 offset = 0;
2133 break;
2135 } /* End of block loop */
2136 } /* End of page loop */
2137 } /* End of iovec loop */
2139 /* Is there any IO still left to submit? */
2140 if (bhind) {
2141 err = do_kio(rw, bhind, bh, size);
2142 if (err >= 0)
2143 transferred += err;
2144 else
2145 goto finished;
2148 finished:
2149 if (transferred)
2150 return transferred;
2151 return err;
2153 error:
2154 /* We got an error allocating the bh'es. Just free the current
2155 buffer_heads and exit. */
2156 spin_lock(&unused_list_lock);
2157 for (i = bhind; --i >= 0; ) {
 2158 __put_unused_buffer_head(bh[i]);
2160 spin_unlock(&unused_list_lock);
2161 wake_up(&buffer_wait);
2163 goto finished;
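
/*
 * Illustrative sketch (not part of this file): how a caller such as a
 * raw-I/O driver might drive brw_kiovec().  It assumes the 2.4 kiobuf
 * helpers alloc_kiovec()/map_user_kiobuf()/unmap_kiobuf()/free_kiovec();
 * the hypothetical caller supplies one block number per "size"-sized
 * chunk of the mapped buffer, contiguous here for simplicity.
 */
#if 0	/* example only, never compiled */
static int example_raw_read(kdev_t dev, unsigned long user_addr,
			    size_t len, unsigned long first_block, int size)
{
	enum { EXAMPLE_MAX_BLOCKS = 64 };	/* hypothetical limit */
	struct kiobuf *iobuf;
	unsigned long b[EXAMPLE_MAX_BLOCKS];
	int i, err, transferred;

	if (len / size > EXAMPLE_MAX_BLOCKS)
		return -EINVAL;

	err = alloc_kiovec(1, &iobuf);
	if (err)
		return err;

	/* Pin the user pages; offset and length must be block aligned. */
	err = map_user_kiobuf(READ, iobuf, user_addr, len);
	if (err)
		goto out_free;

	/* One block number for every "size" bytes to be transferred. */
	for (i = 0; i < len / size; i++)
		b[i] = first_block + i;

	/* Returns bytes transferred or a negative error; partial
	   transfers are not handled in this sketch. */
	transferred = brw_kiovec(READ, 1, &iobuf, dev, b, size);
	err = (transferred < 0) ? transferred : 0;

	unmap_kiobuf(iobuf);
out_free:
	free_kiovec(1, &iobuf);
	return err;
}
#endif
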
/*
 * Start I/O on a page.
 * This function expects the page to be locked and may return
 * before I/O is complete. You then have to check page->locked,
 * page->uptodate, and maybe wait on page->wait.
 *
 * brw_page() is SMP-safe, although it's being called with the
 * kernel lock held - but the code is ready.
 *
 * FIXME: we need a swapper_inode->get_block function to remove
 * some of the bmap kludges and interface ugliness here.
 */
int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
{
	struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
	int nr, fresh /* temporary debugging flag */, block;

	if (!PageLocked(page))
		panic("brw_page: page not locked for I/O");
//	ClearPageError(page);
	/*
	 * We pretty much rely on the page lock for this, because
	 * create_page_buffers() might sleep.
	 */
	fresh = 0;
	if (!page->buffers) {
		create_page_buffers(rw, page, dev, b, size);
		fresh = 1;
	}
	if (!page->buffers)
		BUG();

	head = page->buffers;
	bh = head;
	nr = 0;
	do {
		block = *(b++);

		if (fresh && (atomic_read(&bh->b_count) != 0))
			BUG();
		if (rw == READ) {
			if (!fresh)
				BUG();
			if (!buffer_uptodate(bh)) {
				arr[nr++] = bh;
				atomic_inc(&bh->b_count);
			}
		} else { /* WRITE */
			if (!bh->b_blocknr) {
				if (!block)
					BUG();
				bh->b_blocknr = block;
			} else {
				if (!block)
					BUG();
			}
			set_bit(BH_Uptodate, &bh->b_state);
			set_bit(BH_Dirty, &bh->b_state);
			arr[nr++] = bh;
			atomic_inc(&bh->b_count);
		}
		bh = bh->b_this_page;
	} while (bh != head);
	if ((rw == READ) && nr) {
		if (Page_Uptodate(page))
			BUG();
		ll_rw_block(rw, nr, arr);
	} else {
		if (!nr && rw == READ) {
			SetPageUptodate(page);
			UnlockPage(page);
		}
		if (nr && (rw == WRITE))
			ll_rw_block(rw, nr, arr);
	}
	return 0;
}
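
/*
 * Illustrative sketch (not part of this file): a minimal synchronous
 * wrapper around brw_page(), showing what the comment above asks of the
 * caller - hand over a locked page, then wait and check the result.
 * The block list "b" must hold one entry per "size"-sized piece of the
 * page; error handling is deliberately minimal.
 */
#if 0	/* example only, never compiled */
static int example_read_page_sync(struct page *page, kdev_t dev,
				  int b[], int size)
{
	/* brw_page() expects the page already locked by the caller */
	if (!PageLocked(page))
		return -EINVAL;

	brw_page(READ, page, dev, b, size);	/* submits async I/O */

	wait_on_page(page);			/* brw_page may return early */
	return Page_Uptodate(page) ? 0 : -EIO;
}
#endif
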
int block_symlink(struct inode *inode, const char *symname, int len)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page = grab_cache_page(mapping, 0);
	int err = -ENOMEM;
	char *kaddr;

	if (!page)
		goto fail;
	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
	if (err)
		goto fail_map;
	kaddr = (char*)page_address(page);
	memcpy(kaddr, symname, len-1);
	mapping->a_ops->commit_write(NULL, page, 0, len-1);
	/*
	 * Notice that we are _not_ going to block here - end of page is
	 * unmapped, so this will only try to map the rest of page, see
	 * that it is unmapped (typically even will not look into inode -
	 * ->i_size will be enough for everything) and zero it out.
	 * OTOH it's obviously correct and should make the page up-to-date.
	 */
	err = mapping->a_ops->readpage(NULL, page);
	wait_on_page(page);
	page_cache_release(page);
	if (err < 0)
		goto fail;
	mark_inode_dirty(inode);
	return 0;
fail_map:
	UnlockPage(page);
	page_cache_release(page);
fail:
	return err;
}
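
/*
 * Illustrative sketch (not part of this file): how a filesystem's
 * symlink() method typically uses block_symlink(), following the ext2
 * pattern.  The "myfs_*" names are hypothetical; the target string is
 * written through the page cache (len is strlen()+1, and readpage()
 * zero-fills the tail, so the NUL terminator ends up on disk too) and is
 * later read back via the generic page_symlink_inode_operations.
 */
#if 0	/* example only, never compiled */
static int myfs_symlink(struct inode *dir, struct dentry *dentry,
			const char *symname)
{
	int err, l = strlen(symname) + 1;
	struct inode *inode = myfs_new_inode(dir, S_IFLNK | S_IRWXUGO);

	if (!inode)
		return -ENOSPC;

	inode->i_op = &page_symlink_inode_operations;
	inode->i_mapping->a_ops = &myfs_aops;	/* supplies prepare/commit_write */

	err = block_symlink(inode, symname, l);
	if (err) {
		iput(inode);		/* error unwinding simplified */
		return err;
	}
	return myfs_add_link(dentry, inode);	/* hypothetical directory update */
}
#endif
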
/*
 * Try to increase the number of buffers available: the size argument
 * is used to determine what kind of buffers we want.
 */
static int grow_buffers(int size)
{
	struct page * page;
	struct buffer_head *bh, *tmp;
	struct buffer_head * insert_point;
	int isize;

	if ((size & 511) || (size > PAGE_SIZE)) {
		printk("VFS: grow_buffers: size = %d\n", size);
		return 0;
	}

	page = alloc_page(GFP_BUFFER);
	if (!page)
		goto out;
	bh = create_buffers(page, size, 0);
	if (!bh)
		goto no_buffer_head;

	isize = BUFSIZE_INDEX(size);

	spin_lock(&free_list[isize].lock);
	insert_point = free_list[isize].list;
	tmp = bh;
	while (1) {
		if (insert_point) {
			tmp->b_next_free = insert_point->b_next_free;
			tmp->b_prev_free = insert_point;
			insert_point->b_next_free->b_prev_free = tmp;
			insert_point->b_next_free = tmp;
		} else {
			tmp->b_prev_free = tmp;
			tmp->b_next_free = tmp;
		}
		insert_point = tmp;
		if (tmp->b_this_page)
			tmp = tmp->b_this_page;
		else
			break;
	}
	tmp->b_this_page = bh;
	free_list[isize].list = bh;
	spin_unlock(&free_list[isize].lock);

	page->buffers = bh;
	lru_cache_add(page);
	atomic_inc(&buffermem_pages);
	return 1;

no_buffer_head:
	page_cache_release(page);
out:
	return 0;
}
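
/*
 * Illustrative sketch (not part of this file): the kind of refill loop a
 * buffer allocator can sit in, retrying grow_buffers() and poking bdflush
 * when no page could be obtained.  grow_buffers() returns 1 on success
 * and 0 on failure.  Entirely hypothetical code.
 */
#if 0	/* example only, never compiled */
static void example_refill_freelist(int size)
{
	while (!grow_buffers(size)) {
		/* flush dirty buffers and give them a chance to be freed */
		wakeup_bdflush(1);
		current->policy |= SCHED_YIELD;
		schedule();
	}
}
#endif
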
/*
 * Can the buffer be thrown out?
 */
#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
/*
 * Sync all the buffers on one page..
 *
 * If we have old buffers that are locked, we'll
 * wait on them, but we won't wait on the new ones
 * we're writing out now.
 *
 * This all is required so that we can free up memory
 * later.
 */
static int sync_page_buffers(struct buffer_head *bh, int wait)
{
	struct buffer_head * tmp = bh;

	do {
		struct buffer_head *p = tmp;
		tmp = tmp->b_this_page;
		if (buffer_locked(p)) {
			if (wait)
				__wait_on_buffer(p);
		} else if (buffer_dirty(p))
			ll_rw_block(WRITE, 1, &p);
	} while (tmp != bh);

	do {
		struct buffer_head *p = tmp;
		tmp = tmp->b_this_page;
		if (buffer_busy(p))
			return 0;
	} while (tmp != bh);

	/* Success. Now try_to_free_buffers can free the page. */
	return 1;
}
/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and free's the page if so.
 *
 * Wake up bdflush() if this fails - if we're running low on memory due
 * to dirty buffers, we need to flush them out as quickly as possible.
 *
 * NOTE: There are quite a number of ways that threads of control can
 *       obtain a reference to a buffer head within a page.  So we must
 *       lock out all of these paths to cleanly toss the page.
 */
int try_to_free_buffers(struct page * page, int wait)
{
	struct buffer_head * tmp, * bh = page->buffers;
	int index = BUFSIZE_INDEX(bh->b_size);

again:
	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	spin_lock(&free_list[index].lock);
	tmp = bh;
	do {
		struct buffer_head *p = tmp;

		tmp = tmp->b_this_page;
		if (buffer_busy(p))
			goto busy_buffer_page;
	} while (tmp != bh);

	spin_lock(&unused_list_lock);
	tmp = bh;
	do {
		struct buffer_head * p = tmp;
		tmp = tmp->b_this_page;

		/* The buffer can be either on the regular
		 * queues or on the free list..
		 */
		if (p->b_dev != B_FREE) {
			remove_inode_queue(p);
			__remove_from_queues(p);
		} else
			__remove_from_free_list(p, index);
		__put_unused_buffer_head(p);
	} while (tmp != bh);
	spin_unlock(&unused_list_lock);

	/* Wake up anyone waiting for buffer heads */
	wake_up(&buffer_wait);

	/* And free the page */
	page->buffers = NULL;
	page_cache_release(page);
	spin_unlock(&free_list[index].lock);
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
	return 1;

busy_buffer_page:
	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
	spin_unlock(&free_list[index].lock);
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
	if (sync_page_buffers(bh, wait))
		goto again;
	return 0;
}
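
/*
 * Illustrative sketch (not part of this file): how a memory-reclaim path
 * might use try_to_free_buffers().  The caller must hold the page lock so
 * that no new references to the buffer heads can appear while they are
 * tested; "wait" selects whether locked buffers are waited on inside
 * sync_page_buffers().  Hypothetical code.
 */
#if 0	/* example only, never compiled */
static int example_release_page(struct page *page, int gfp_can_wait)
{
	if (!PageLocked(page))
		BUG();

	if (page->buffers && !try_to_free_buffers(page, gfp_can_wait))
		return 0;	/* still busy: writeback was started for us */

	/* no buffers left; the page itself can now be reclaimed */
	return 1;
}
#endif
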
/* ================== Debugging =================== */

void show_buffers(void)
{
#ifdef CONFIG_SMP
	struct buffer_head * bh;
	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
	int protected = 0;
	int nlist;
	static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
#endif

	printk("Buffer memory: %6dkB\n",
	       atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));

#ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
	if (!spin_trylock(&lru_list_lock))
		return;
	for (nlist = 0; nlist < NR_LIST; nlist++) {
		found = locked = dirty = used = lastused = protected = 0;
		bh = lru_list[nlist];
		if (!bh) continue;

		do {
			found++;
			if (buffer_locked(bh))
				locked++;
			if (buffer_protected(bh))
				protected++;
			if (buffer_dirty(bh))
				dirty++;
			if (atomic_read(&bh->b_count))
				used++, lastused = found;
			bh = bh->b_next_free;
		} while (bh != lru_list[nlist]);
		{
			int tmp = nr_buffers_type[nlist];
			if (found != tmp)
				printk("%9s: BUG -> found %d, reported %d\n",
				       buf_types[nlist], found, tmp);
		}
		printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
		       "%d locked, %d protected, %d dirty\n",
		       buf_types[nlist], found, size_buffers_type[nlist]>>10,
		       used, lastused, locked, protected, dirty);
	}
	spin_unlock(&lru_list_lock);
#endif
}
/* ===================== Init ======================= */

/*
 * allocate the hash table and init the free list
 * Use gfp() for the hash table to decrease TLB misses, use
 * SLAB cache for buffer heads.
 */
void __init buffer_init(unsigned long mempages)
{
	int order, i;
	unsigned int nr_hash;

	/* The buffer cache hash table is less important these days,
	 * trim it a bit.
	 */
	mempages >>= 14;

	mempages *= sizeof(struct buffer_head *);

	for (order = 0; (1 << order) < mempages; order++)
		;

	/* try to allocate something until we get it or we're asking
	   for something that is really too small */

	do {
		unsigned long tmp;

		nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
		bh_hash_mask = (nr_hash - 1);

		tmp = nr_hash;
		bh_hash_shift = 0;
		while ((tmp >>= 1UL) != 0UL)
			bh_hash_shift++;

		hash_table = (struct buffer_head **)
			__get_free_pages(GFP_ATOMIC, order);
	} while (hash_table == NULL && --order > 0);
	printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
	       nr_hash, order, (PAGE_SIZE << order));

	if (!hash_table)
		panic("Failed to allocate buffer hash table\n");

	/* Setup hash chains. */
	for (i = 0; i < nr_hash; i++)
		hash_table[i] = NULL;

	/* Setup free lists. */
	for (i = 0; i < NR_SIZES; i++) {
		free_list[i].list = NULL;
		free_list[i].lock = SPIN_LOCK_UNLOCKED;
	}

	/* Setup lru lists. */
	for (i = 0; i < NR_LIST; i++)
		lru_list[i] = NULL;

	bh_cachep = kmem_cache_create("buffer_head",
				      sizeof(struct buffer_head),
				      0,
				      SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!bh_cachep)
		panic("Cannot create buffer head SLAB cache\n");
}
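
/*
 * Worked sizing example (illustrative only): with 4 kB pages and 128 MB
 * of RAM, mempages = 32768, so 32768 >> 14 = 2 and 2 * sizeof(void *) = 8
 * on a 32-bit box; the first order with (1 << order) >= 8 is 3, giving a
 * hash table of (PAGE_SIZE << 3) / 4 = 8192 buckets, bh_hash_mask = 8191
 * and bh_hash_shift = 13.
 */
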
/* ====================== bdflush support =================== */

/* This is a simple kernel daemon, whose job it is to provide a dynamic
 * response to dirty buffers.  Once this process is activated, we write back
 * a limited number of buffers to the disks and then go back to sleep again.
 */
static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
struct task_struct *bdflush_tsk = 0;

void wakeup_bdflush(int block)
{
	DECLARE_WAITQUEUE(wait, current);

	if (current == bdflush_tsk)
		return;

	if (!block) {
		wake_up_process(bdflush_tsk);
		return;
	}

	/* kflushd can wake us up before we have a chance to go to sleep,
	   so we must be smart in handling this wakeup event from kflushd
	   to avoid deadlocking on SMP (we are not holding any lock any
	   more in these two paths). */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&bdflush_done, &wait);

	wake_up_process(bdflush_tsk);
	schedule();

	remove_wait_queue(&bdflush_done, &wait);
	__set_current_state(TASK_RUNNING);
}
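
/*
 * Illustrative sketch (not part of this file): the intended use of the
 * "block" argument.  A writer that only wants background writeback gives
 * the daemon a non-blocking poke; one that must throttle itself passes
 * block = 1 and sleeps on bdflush_done until bdflush has done a pass.
 * Hypothetical code.
 */
#if 0	/* example only, never compiled */
static void example_balance_dirty(int state)
{
	if (state > 0)			/* far too many dirty buffers: throttle */
		wakeup_bdflush(1);	/* sleeps on bdflush_done */
	else if (state == 0)		/* getting full: just kick the daemon */
		wakeup_bdflush(0);
	/* state < 0: plenty of room, nothing to do */
}
#endif
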
/* This is the _only_ function that deals with flushing async writes
   to disk.
   NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
   as all dirty buffers live _only_ in the DIRTY lru list.
   As we never browse the LOCKED and CLEAN lru lists they are in fact
   completely useless. */
static int flush_dirty_buffers(int check_flushtime)
{
	struct buffer_head * bh, *next;
	int flushed = 0, i;

restart:
	spin_lock(&lru_list_lock);
	bh = lru_list[BUF_DIRTY];
	if (!bh)
		goto out_unlock;
	for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
		next = bh->b_next_free;

		if (!buffer_dirty(bh)) {
			__refile_buffer(bh);
			continue;
		}
		if (buffer_locked(bh))
			continue;

		if (check_flushtime) {
			/* The dirty lru list is chronologically ordered so
			   if the current bh is not yet timed out,
			   then also all the following bhs
			   will be too young. */
			if (time_before(jiffies, bh->b_flushtime))
				goto out_unlock;
		} else {
			if (++flushed > bdf_prm.b_un.ndirty)
				goto out_unlock;
		}

		/* OK, now we are committed to write it out. */
		atomic_inc(&bh->b_count);
		spin_unlock(&lru_list_lock);
		ll_rw_block(WRITE, 1, &bh);
		atomic_dec(&bh->b_count);

		if (current->need_resched)
			schedule();
		goto restart;
	}
out_unlock:
	spin_unlock(&lru_list_lock);

	return flushed;
}
/*
 * Here we attempt to write back old buffers.  We also try to flush inodes
 * and supers as well, since this function is essentially "update", and
 * otherwise there would be no way of ensuring that these quantities ever
 * get written back.  Ideally, we would have a timestamp on the inodes
 * and superblocks so that we could write back only the old ones as well.
 */
static int sync_old_buffers(void)
{
	lock_kernel();
	sync_supers(0);
	sync_inodes(0);
	unlock_kernel();

	flush_dirty_buffers(1);
	/* must really sync all the active I/O requests to disk here */
	run_task_queue(&tq_disk);
	return 0;
}
int block_sync_page(struct page *page)
{
	run_task_queue(&tq_disk);
	return 0;
}
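
/*
 * Illustrative sketch (not part of this file): block_sync_page() is meant
 * to be plugged into a filesystem's address_space_operations as the
 * ->sync_page hook, so that anyone waiting on one of its pages kicks the
 * block-device request queue.  The "myfs_*" names are hypothetical.
 */
#if 0	/* example only, never compiled */
static struct address_space_operations myfs_aops = {
	readpage:	myfs_readpage,
	writepage:	myfs_writepage,
	sync_page:	block_sync_page,
	prepare_write:	myfs_prepare_write,
	commit_write:	myfs_commit_write,
};
#endif
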
/* This is the interface to bdflush.  As we get more sophisticated, we can
 * pass tuning parameters to this "process", to adjust how it behaves.
 * We would want to verify each parameter, however, to make sure that it
 * is reasonable. */
asmlinkage long sys_bdflush(int func, long data)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (func == 1) {
		/* do_exit directly and let kupdate do its work alone. */
		do_exit(0);
#if 0 /* left here as it's the only example of lazy-mm-stuff used from
	 a syscall that doesn't care about the current mm context. */
		int error;
		struct mm_struct *user_mm;

		/*
		 * bdflush will spend all of its time in kernel-space,
		 * without touching user-space, so we can switch it into
		 * 'lazy TLB mode' to reduce the cost of context-switches
		 * to and from bdflush.
		 */
		user_mm = start_lazy_tlb();
		error = sync_old_buffers();
		end_lazy_tlb(user_mm);
		return error;
#endif
	}

	/* An even func (2, 4, ...) reads parameter (func-2)>>1, the
	 * following odd func writes it: func 2 reads param 0, func 3
	 * writes param 0, func 4 reads param 1, and so on. */
	if (func >= 2) {
		int i = (func-2) >> 1;
		if (i >= 0 && i < N_PARAM) {
			if ((func & 1) == 0)
				return put_user(bdf_prm.data[i], (int*)data);

			if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
				bdf_prm.data[i] = data;
				return 0;
			}
		}
		return -EINVAL;
	}

	/* Calling with func 0 used to launch the actual bdflush and then
	 * never return (unless it was explicitly killed).  We return zero
	 * here to remain semi-compatible with present update(8) programs.
	 */
	return 0;
}
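
/*
 * Illustrative sketch (not part of this file): driving the bdflush
 * syscall from userspace.  Reading parameter i uses func = 2*i + 2 and
 * passes a pointer for the result; writing uses func = 2*i + 3 and passes
 * the new value directly.  CAP_SYS_ADMIN is required.  Hypothetical code,
 * assuming the call is reached via syscall(2).
 */
#if 0	/* example only, never compiled */
#include <unistd.h>
#include <sys/syscall.h>

static int read_bdflush_param(int i, int *value)
{
	return syscall(SYS_bdflush, 2 * i + 2, (long) value);
}

static int write_bdflush_param(int i, long value)
{
	return syscall(SYS_bdflush, 2 * i + 3, value);
}
#endif
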
/*
 * This is the actual bdflush daemon itself. It used to be started from
 * the syscall above, but now we launch it ourselves internally with
 * kernel_thread(...) directly after the first thread in init/main.c
 */
int bdflush(void *sem)
{
	struct task_struct *tsk = current;
	int flushed;
	/*
	 *	We have a bare-bones task_struct, and really should fill
	 *	in a few more things so "top" and /proc/2/{exe,root,cwd}
	 *	display semi-sane things. Not real crucial though...
	 */

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kflushd");
	bdflush_tsk = tsk;

	/* avoid getting signals */
	spin_lock_irq(&tsk->sigmask_lock);
	flush_signals(tsk);
	sigfillset(&tsk->blocked);
	recalc_sigpending(tsk);
	spin_unlock_irq(&tsk->sigmask_lock);

	up((struct semaphore *)sem);

	for (;;) {
		CHECK_EMERGENCY_SYNC

		flushed = flush_dirty_buffers(0);

		/* If wakeup_bdflush() wakes us up after we have already
		   issued our bdflush_done wakeup, we must make sure not
		   to sleep in the schedule() below; otherwise
		   wakeup_bdflush() would wait for a bdflush_done wakeup
		   that never arrives (because we would be asleep) and
		   deadlock on SMP.  That is why the task state is set
		   before the wake_up(). */
		__set_current_state(TASK_INTERRUPTIBLE);
		wake_up(&bdflush_done);
		/*
		 * If there are still a lot of dirty buffers around,
		 * skip the sleep and flush some more. Otherwise, we
		 * go to sleep waiting for a wakeup.
		 */
		if (!flushed || balance_dirty_state(NODEV) < 0)
			schedule();
		/* Remember to mark us as running, otherwise
		   the next schedule will block. */
		__set_current_state(TASK_RUNNING);
	}
}
/*
 *	This is the kernel update daemon. It used to live in userspace,
 *	but since it needs to run safely we want it to be unkillable by
 *	mistake. You don't need to change your userspace configuration,
 *	since the userspace `update` will do_exit(0) at the first
 *	sys_bdflush().
 */
int kupdate(void *sem)
{
	struct task_struct * tsk = current;
	int interval;

	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kupdate");

	/* sigstop and sigcont will stop and wakeup kupdate */
	spin_lock_irq(&tsk->sigmask_lock);
	sigfillset(&tsk->blocked);
	siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
	recalc_sigpending(tsk);
	spin_unlock_irq(&tsk->sigmask_lock);

	up((struct semaphore *)sem);

	for (;;) {
		/* update interval */
		interval = bdf_prm.b_un.interval;
		if (interval) {
			tsk->state = TASK_INTERRUPTIBLE;
			schedule_timeout(interval);
		} else {
		stop_kupdate:
			tsk->state = TASK_STOPPED;
			schedule(); /* wait for SIGCONT */
		}
		/* check for sigstop */
		if (signal_pending(tsk)) {
			int stopped = 0;
			spin_lock_irq(&tsk->sigmask_lock);
			if (sigismember(&tsk->signal, SIGSTOP)) {
				sigdelset(&tsk->signal, SIGSTOP);
				stopped = 1;
			}
			recalc_sigpending(tsk);
			spin_unlock_irq(&tsk->sigmask_lock);
			if (stopped)
				goto stop_kupdate;
		}
#ifdef DEBUG
		printk("kupdate() activated...\n");
#endif
		sync_old_buffers();
	}
}
static int __init bdflush_init(void)
{
	DECLARE_MUTEX_LOCKED(sem);
	kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
	down(&sem);
	kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
	down(&sem);
	return 0;
}

module_init(bdflush_init)