fs/buffer.c (davej-history.git, import of 2.4.0-test6pre2)
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
53 #define NR_SIZES 7
54 static char buffersize_index[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
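/*
 * Illustrative sketch, not part of the original buffer.c: how
 * buffersize_index[] and BUFSIZE_INDEX() map a block size to a free-list
 * slot.  Only the power-of-two sizes 512..32768 bytes have valid entries
 * (512 -> 0, 1024 -> 1, ..., 32768 -> 6); anything else yields -1.  The
 * function name is made up for illustration.
 */
static int example_bufsize_index_sanity(void)
{
        if (BUFSIZE_INDEX(512) != 0 ||
            BUFSIZE_INDEX(4096) != 3 ||
            BUFSIZE_INDEX(32768) != 6)
                return -1;      /* table and macro disagree */
        return 0;
}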
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
72 * Hash table gook..
74 static unsigned int bh_hash_mask;
75 static unsigned int bh_hash_shift;
76 static struct buffer_head **hash_table;
77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
79 static struct buffer_head *lru_list[NR_LIST];
80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
81 static int nr_buffers_type[NR_LIST];
82 static unsigned long size_buffers_type[NR_LIST];
84 static struct buffer_head * unused_list;
85 static int nr_unused_buffer_heads;
86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
89 struct bh_free_head {
90 struct buffer_head *list;
91 spinlock_t lock;
93 static struct bh_free_head free_list[NR_SIZES];
95 static int grow_buffers(int size);
96 static void __refile_buffer(struct buffer_head *);
98 /* This is used by some architectures to estimate available memory. */
99 atomic_t buffermem_pages = ATOMIC_INIT(0);
101 /* Here is the parameter block for the bdflush process. If you add or
102 * remove any of the parameters, make sure to update kernel/sysctl.c.
105 #define N_PARAM 9
107 /* The dummy values in this structure are left in there for compatibility
108 * with old programs that play with the /proc entries.
110 union bdflush_param {
111 struct {
112 int nfract; /* Percentage of buffer cache dirty to
113 activate bdflush */
114 int ndirty; /* Maximum number of dirty blocks to write out per
115 wake-cycle */
116 int nrefill; /* Number of clean buffers to try to obtain
117 each time we call refill */
118 int nref_dirt; /* Dirty buffer threshold for activating bdflush
119 when trying to refill buffers. */
120 int interval; /* jiffies delay between kupdate flushes */
121 int age_buffer; /* Time for normal buffer to age before we flush it */
122 int age_super; /* Time for superblock to age before we flush it */
123 int dummy2; /* unused */
124 int dummy3; /* unused */
125 } b_un;
126 unsigned int data[N_PARAM];
127 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
129 /* These are the min and max parameter values that we will allow to be assigned */
130 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
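/*
 * Illustrative sketch, not part of the original file: a small userspace
 * program that dumps the bdf_prm parameters, assuming they are exported
 * by kernel/sysctl.c as the nine integers of /proc/sys/vm/bdflush (the
 * "/proc entries" the comment above refers to).  The path and the
 * space-separated format are assumptions about the 2.4-era sysctl layout.
 */
#include <stdio.h>

int main(void)
{
        int v[9], i, n = 0;
        FILE *f = fopen("/proc/sys/vm/bdflush", "r");

        if (!f)
                return 1;
        while (n < 9 && fscanf(f, "%d", &v[n]) == 1)
                n++;
        fclose(f);
        for (i = 0; i < n; i++)
                printf("bdflush param %d = %d\n", i, v[i]);
        return 0;
}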
134 * Rewrote the wait-routines to use the "new" wait-queue functionality,
135 * and getting rid of the cli-sti pairs. The wait-queue routines still
136 * need cli-sti, but now it's just a couple of 386 instructions or so.
138 * Note that the real wait_on_buffer() is an inline function that checks
139 * if 'b_wait' is set before calling this, so that the queues aren't set
140 * up unnecessarily.
142 void __wait_on_buffer(struct buffer_head * bh)
144 struct task_struct *tsk = current;
145 DECLARE_WAITQUEUE(wait, tsk);
147 atomic_inc(&bh->b_count);
148 add_wait_queue(&bh->b_wait, &wait);
149 do {
150 run_task_queue(&tq_disk);
151 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
152 if (!buffer_locked(bh))
153 break;
154 schedule();
155 } while (buffer_locked(bh));
156 tsk->state = TASK_RUNNING;
157 remove_wait_queue(&bh->b_wait, &wait);
158 atomic_dec(&bh->b_count);
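/*
 * Illustrative sketch, not part of this file: the real wait_on_buffer()
 * is an inline wrapper (in <linux/locks.h> in this era) that only falls
 * into the slow path above when the buffer is actually locked; roughly:
 */
static inline void example_wait_on_buffer(struct buffer_head * bh)
{
        if (buffer_locked(bh))          /* cheap test before queueing a wait */
                __wait_on_buffer(bh);
}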
161 /* Call sync_buffers with wait!=0 to ensure that the call does not
162 * return until all buffer writes have completed. Sync() may return
163 * before the writes have finished; fsync() may not.
166 /* Godamity-damn. Some buffers (bitmaps for filesystems)
167 * spontaneously dirty themselves without ever brelse being called.
168 * We will ultimately want to put these in a separate list, but for
169 * now we search all of the lists for dirty buffers.
171 static int sync_buffers(kdev_t dev, int wait)
173 int i, retry, pass = 0, err = 0;
174 struct buffer_head * bh, *next;
176 /* One pass for no-wait, three for wait:
177 * 0) write out all dirty, unlocked buffers;
178 * 1) write out all dirty buffers, waiting if locked;
179 * 2) wait for completion by waiting for all buffers to unlock.
181 do {
182 retry = 0;
184 /* We search all lists as a failsafe mechanism, not because we expect
185 * there to be dirty buffers on any of the other lists.
187 repeat:
188 spin_lock(&lru_list_lock);
189 bh = lru_list[BUF_DIRTY];
190 if (!bh)
191 goto repeat2;
193 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
194 next = bh->b_next_free;
196 if (!lru_list[BUF_DIRTY])
197 break;
198 if (dev && bh->b_dev != dev)
199 continue;
200 if (buffer_locked(bh)) {
201 /* Buffer is locked; skip it unless wait is
202 * requested AND pass > 0.
204 if (!wait || !pass) {
205 retry = 1;
206 continue;
208 atomic_inc(&bh->b_count);
209 spin_unlock(&lru_list_lock);
210 wait_on_buffer (bh);
211 atomic_dec(&bh->b_count);
212 goto repeat;
215 /* If an unlocked buffer is not uptodate, there has
216 * been an IO error. Skip it.
218 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
219 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
220 err = -EIO;
221 continue;
224 /* Don't write clean buffers. Don't write ANY buffers
225 * on the third pass.
227 if (!buffer_dirty(bh) || pass >= 2)
228 continue;
230 atomic_inc(&bh->b_count);
231 spin_unlock(&lru_list_lock);
232 ll_rw_block(WRITE, 1, &bh);
233 atomic_dec(&bh->b_count);
234 retry = 1;
235 goto repeat;
238 repeat2:
239 bh = lru_list[BUF_LOCKED];
240 if (!bh) {
241 spin_unlock(&lru_list_lock);
242 break;
244 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
245 next = bh->b_next_free;
247 if (!lru_list[BUF_LOCKED])
248 break;
249 if (dev && bh->b_dev != dev)
250 continue;
251 if (buffer_locked(bh)) {
252 /* Buffer is locked; skip it unless wait is
253 * requested AND pass > 0.
255 if (!wait || !pass) {
256 retry = 1;
257 continue;
259 atomic_inc(&bh->b_count);
260 spin_unlock(&lru_list_lock);
261 wait_on_buffer (bh);
262 spin_lock(&lru_list_lock);
263 atomic_dec(&bh->b_count);
264 goto repeat2;
267 spin_unlock(&lru_list_lock);
269 /* If we are waiting for the sync to succeed, and if any dirty
270 * blocks were written, then repeat; on the second pass, only
271 * wait for buffers being written (do not pass to write any
272 * more buffers on the second pass).
274 } while (wait && retry && ++pass<=2);
275 return err;
278 void sync_dev(kdev_t dev)
280 sync_supers(dev);
281 sync_inodes(dev);
282 DQUOT_SYNC(dev);
283 /* sync all the dirty buffers out to disk only _after_ all the
284 high level layers finished generating buffer dirty data
285 (or we'll return with some buffer still dirty on the blockdevice
286 so breaking the semantics of this call) */
287 sync_buffers(dev, 0);
289 * FIXME(eric) we need to sync the physical devices here.
290 * This is because some (scsi) controllers have huge amounts of
291 * cache onboard (hundreds of Mb), and we need to instruct
292 * them to commit all of the dirty memory to disk, and we should
293 * not return until this has happened.
295 * This would need to get implemented by going through the assorted
296 * layers so that each block major number can be synced, and this
297 * would call down into the upper and mid-layer scsi.
301 int fsync_dev(kdev_t dev)
303 sync_buffers(dev, 0);
305 lock_kernel();
306 sync_supers(dev);
307 sync_inodes(dev);
308 DQUOT_SYNC(dev);
309 unlock_kernel();
311 return sync_buffers(dev, 1);
314 asmlinkage long sys_sync(void)
316 fsync_dev(0);
317 return 0;
321 * filp may be NULL if called via the msync of a vma.
324 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
326 struct inode * inode = dentry->d_inode;
327 struct super_block * sb;
328 kdev_t dev;
329 int ret;
331 lock_kernel();
332 /* sync the inode to buffers */
333 write_inode_now(inode, 0);
335 /* sync the superblock to buffers */
336 sb = inode->i_sb;
337 wait_on_super(sb);
338 if (sb->s_op && sb->s_op->write_super)
339 sb->s_op->write_super(sb);
341 /* .. finally sync the buffers to disk */
342 dev = inode->i_dev;
343 ret = sync_buffers(dev, 1);
344 unlock_kernel();
345 return ret;
348 asmlinkage long sys_fsync(unsigned int fd)
350 struct file * file;
351 struct dentry * dentry;
352 struct inode * inode;
353 int err;
355 err = -EBADF;
356 file = fget(fd);
357 if (!file)
358 goto out;
360 dentry = file->f_dentry;
361 inode = dentry->d_inode;
363 err = -EINVAL;
364 if (!file->f_op || !file->f_op->fsync)
365 goto out_putf;
367 /* We need to protect against concurrent writers.. */
368 down(&inode->i_sem);
369 err = file->f_op->fsync(file, dentry, 0);
370 up(&inode->i_sem);
372 out_putf:
373 fput(file);
374 out:
375 return err;
378 asmlinkage long sys_fdatasync(unsigned int fd)
380 struct file * file;
381 struct dentry * dentry;
382 struct inode * inode;
383 int err;
385 err = -EBADF;
386 file = fget(fd);
387 if (!file)
388 goto out;
390 dentry = file->f_dentry;
391 inode = dentry->d_inode;
393 err = -EINVAL;
394 if (!file->f_op || !file->f_op->fsync)
395 goto out_putf;
397 down(&inode->i_sem);
398 err = file->f_op->fsync(file, dentry, 1);
399 up(&inode->i_sem);
401 out_putf:
402 fput(file);
403 out:
404 return err;
407 /* After several hours of tedious analysis, the following hash
408 * function won. Do not mess with it... -DaveM
410 #define _hashfn(dev,block) \
411 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
412 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
413 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
415 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
417 if ((bh->b_next = *head) != NULL)
418 bh->b_next->b_pprev = &bh->b_next;
419 *head = bh;
420 bh->b_pprev = head;
423 static __inline__ void __hash_unlink(struct buffer_head *bh)
425 if (bh->b_pprev) {
426 if (bh->b_next)
427 bh->b_next->b_pprev = bh->b_pprev;
428 *(bh->b_pprev) = bh->b_next;
429 bh->b_pprev = NULL;
433 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
435 struct buffer_head **bhp = &lru_list[blist];
437 if(!*bhp) {
438 *bhp = bh;
439 bh->b_prev_free = bh;
441 bh->b_next_free = *bhp;
442 bh->b_prev_free = (*bhp)->b_prev_free;
443 (*bhp)->b_prev_free->b_next_free = bh;
444 (*bhp)->b_prev_free = bh;
445 nr_buffers_type[blist]++;
446 size_buffers_type[blist] += bh->b_size;
449 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
451 if (bh->b_prev_free || bh->b_next_free) {
452 bh->b_prev_free->b_next_free = bh->b_next_free;
453 bh->b_next_free->b_prev_free = bh->b_prev_free;
454 if (lru_list[blist] == bh)
455 lru_list[blist] = bh->b_next_free;
456 if (lru_list[blist] == bh)
457 lru_list[blist] = NULL;
458 bh->b_next_free = bh->b_prev_free = NULL;
459 nr_buffers_type[blist]--;
460 size_buffers_type[blist] -= bh->b_size;
464 static void __remove_from_free_list(struct buffer_head * bh, int index)
466 if(bh->b_next_free == bh)
467 free_list[index].list = NULL;
468 else {
469 bh->b_prev_free->b_next_free = bh->b_next_free;
470 bh->b_next_free->b_prev_free = bh->b_prev_free;
471 if (free_list[index].list == bh)
472 free_list[index].list = bh->b_next_free;
474 bh->b_next_free = bh->b_prev_free = NULL;
477 /* must be called with both the hash_table_lock and the lru_list_lock
478 held */
479 static void __remove_from_queues(struct buffer_head *bh)
481 __hash_unlink(bh);
482 __remove_from_lru_list(bh, bh->b_list);
485 static void __insert_into_queues(struct buffer_head *bh)
487 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
489 __hash_link(bh, head);
490 __insert_into_lru_list(bh, bh->b_list);
493 /* This function must only run if there are no other
494 * references _anywhere_ to this buffer head.
496 static void put_last_free(struct buffer_head * bh)
498 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
499 struct buffer_head **bhp = &head->list;
501 bh->b_state = 0;
503 spin_lock(&head->lock);
504 bh->b_dev = B_FREE;
505 if(!*bhp) {
506 *bhp = bh;
507 bh->b_prev_free = bh;
509 bh->b_next_free = *bhp;
510 bh->b_prev_free = (*bhp)->b_prev_free;
511 (*bhp)->b_prev_free->b_next_free = bh;
512 (*bhp)->b_prev_free = bh;
513 spin_unlock(&head->lock);
517 * Why like this, I hear you say... The reason is race-conditions.
518 * As we don't lock buffers (unless we are reading them, that is),
519 * something might happen to it while we sleep (ie a read-error
520 * will force it bad). This shouldn't really happen currently, but
521 * the code is ready.
523 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
525 struct buffer_head *bh = hash(dev, block);
527 for (; bh; bh = bh->b_next)
528 if (bh->b_blocknr == block &&
529 bh->b_size == size &&
530 bh->b_dev == dev)
531 break;
532 if (bh)
533 atomic_inc(&bh->b_count);
535 return bh;
538 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
540 struct buffer_head *bh;
542 read_lock(&hash_table_lock);
543 bh = __get_hash_table(dev, block, size);
544 read_unlock(&hash_table_lock);
546 return bh;
549 unsigned int get_hardblocksize(kdev_t dev)
552 * Get the hard sector size for the given device. If we don't know
553 * what it is, return 0.
555 if (hardsect_size[MAJOR(dev)] != NULL) {
556 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
557 if (blksize != 0)
558 return blksize;
562 * We don't know what the hardware sector size for this device is.
563 * Return 0 indicating that we don't know.
565 return 0;
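/*
 * Illustrative sketch, not part of the original file: a typical caller
 * pattern is to fall back to the traditional 512-byte sector size when
 * the driver did not register a hard sector size.  The helper name is
 * made up.
 */
static unsigned int example_hard_sector_size(kdev_t dev)
{
        unsigned int hs = get_hardblocksize(dev);

        return hs ? hs : 512;
}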
568 /* If invalidate_buffers() will trash dirty buffers, it means some kind
569 of fs corruption is going on. Trashing dirty data always implies losing
570 information that was supposed to be just stored on the physical layer
571 by the user.
573 Thus invalidate_buffers in general usage is not allowed to trash dirty
574 buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
576 NOTE: In the case where the user removed a removable-media disk even if
577 there's still dirty data not synced on disk (due to a bug in the device driver
578 or to an error of the user), by not destroying the dirty buffers we could
579 generate corruption also on the next media inserted, thus a parameter is
580 necessary to handle this case in the safest way possible (trying
581 not to corrupt the newly inserted disk with the data belonging to
582 the old, now corrupted disk). Also for the ramdisk the natural thing
583 to do in order to release the ramdisk memory is to destroy dirty buffers.
585 These are the two special cases. Normal usage requires the device driver
586 to issue a sync on the device (without waiting for I/O completion) and
587 then an invalidate_buffers call that doesn't trash dirty buffers. */
588 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
590 int i, nlist, slept;
591 struct buffer_head * bh, * bh_next;
593 retry:
594 slept = 0;
595 spin_lock(&lru_list_lock);
596 for(nlist = 0; nlist < NR_LIST; nlist++) {
597 bh = lru_list[nlist];
598 if (!bh)
599 continue;
600 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
601 bh_next = bh->b_next_free;
602 if (bh->b_dev != dev)
603 continue;
604 if (buffer_locked(bh)) {
605 atomic_inc(&bh->b_count);
606 spin_unlock(&lru_list_lock);
607 wait_on_buffer(bh);
608 slept = 1;
609 spin_lock(&lru_list_lock);
610 atomic_dec(&bh->b_count);
613 write_lock(&hash_table_lock);
614 if (!atomic_read(&bh->b_count) &&
615 (destroy_dirty_buffers || !buffer_dirty(bh))) {
616 __remove_from_queues(bh);
617 put_last_free(bh);
619 write_unlock(&hash_table_lock);
620 if (slept)
621 goto out;
624 out:
625 spin_unlock(&lru_list_lock);
626 if (slept)
627 goto retry;
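/*
 * Illustrative sketch, not part of the original file: callers normally go
 * through thin wrappers (expected to live in <linux/fs.h>, names assumed):
 * invalidate_buffers(dev) for the keep-dirty-data case and
 * destroy_buffers(dev) for ramdisk-style teardown.  A removable-media
 * driver would typically do something like this on a media change:
 */
static void example_media_change(kdev_t dev)
{
        /* preserve dirty data - the safe default when media was yanked */
        __invalidate_buffers(dev, 0);
}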
630 void set_blocksize(kdev_t dev, int size)
632 extern int *blksize_size[];
633 int i, nlist, slept;
634 struct buffer_head * bh, * bh_next;
636 if (!blksize_size[MAJOR(dev)])
637 return;
639 /* Size must be a power of two, and between 512 and PAGE_SIZE */
640 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
641 panic("Invalid blocksize passed to set_blocksize");
643 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
644 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
645 return;
647 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
648 return;
649 sync_buffers(dev, 2);
650 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
652 retry:
653 slept = 0;
654 spin_lock(&lru_list_lock);
655 for(nlist = 0; nlist < NR_LIST; nlist++) {
656 bh = lru_list[nlist];
657 if (!bh)
658 continue;
659 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
660 bh_next = bh->b_next_free;
661 if (bh->b_dev != dev || bh->b_size == size)
662 continue;
663 if (buffer_locked(bh)) {
664 atomic_inc(&bh->b_count);
665 spin_unlock(&lru_list_lock);
666 wait_on_buffer(bh);
667 slept = 1;
668 spin_lock(&lru_list_lock);
669 atomic_dec(&bh->b_count);
672 write_lock(&hash_table_lock);
673 if (!atomic_read(&bh->b_count)) {
674 if (buffer_dirty(bh))
675 printk(KERN_WARNING
676 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
677 kdevname(dev), bh->b_blocknr, bh->b_size);
678 __remove_from_queues(bh);
679 put_last_free(bh);
680 } else {
681 if (atomic_set_buffer_clean(bh))
682 __refile_buffer(bh);
683 clear_bit(BH_Uptodate, &bh->b_state);
684 printk(KERN_WARNING
685 "set_blocksize: "
686 "b_count %d, dev %s, block %lu, from %p\n",
687 atomic_read(&bh->b_count), bdevname(bh->b_dev),
688 bh->b_blocknr, __builtin_return_address(0));
690 write_unlock(&hash_table_lock);
691 if (slept)
692 goto out;
695 out:
696 spin_unlock(&lru_list_lock);
697 if (slept)
698 goto retry;
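/*
 * Illustrative sketch, not part of the original file: filesystems usually
 * call set_blocksize() once at mount time (e.g. from their read_super()
 * code) before reading any metadata with bread().  BLOCK_SIZE here is the
 * traditional 1024-byte default from <linux/fs.h>.
 */
static void example_mount_blocksize(struct super_block * sb, kdev_t dev)
{
        set_blocksize(dev, BLOCK_SIZE);
        sb->s_blocksize = BLOCK_SIZE;
        sb->s_blocksize_bits = BLOCK_SIZE_BITS;
}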
702 * We used to try various strange things. Let's not.
704 static void refill_freelist(int size)
706 if (!grow_buffers(size)) {
707 wakeup_bdflush(1);
708 current->policy |= SCHED_YIELD;
709 schedule();
713 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
715 bh->b_list = BUF_CLEAN;
716 bh->b_end_io = handler;
717 bh->b_private = private;
720 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
722 mark_buffer_uptodate(bh, uptodate);
723 unlock_buffer(bh);
726 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
728 mark_buffer_uptodate(bh, uptodate);
729 unlock_buffer(bh);
730 BUG();
733 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
735 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
736 unsigned long flags;
737 struct buffer_head *tmp;
738 struct page *page;
740 mark_buffer_uptodate(bh, uptodate);
742 /* This is a temporary buffer used for page I/O. */
743 page = bh->b_page;
745 if (!uptodate)
746 SetPageError(page);
749 * Be _very_ careful from here on. Bad things can happen if
750 * two buffer heads end IO at almost the same time and both
751 * decide that the page is now completely done.
753 * Async buffer_heads are here only as labels for IO, and get
754 * thrown away once the IO for this page is complete. IO is
755 * deemed complete once all buffers have been visited
756 * (b_count==0) and are now unlocked. We must make sure that
757 * only the _last_ buffer that decrements its count is the one
758 * that unlocks the page.
760 spin_lock_irqsave(&page_uptodate_lock, flags);
761 unlock_buffer(bh);
762 atomic_dec(&bh->b_count);
763 tmp = bh->b_this_page;
764 while (tmp != bh) {
765 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
766 goto still_busy;
767 tmp = tmp->b_this_page;
770 /* OK, the async IO on this page is complete. */
771 spin_unlock_irqrestore(&page_uptodate_lock, flags);
774 * if none of the buffers had errors then we can set the
775 * page uptodate:
777 if (!PageError(page))
778 SetPageUptodate(page);
781 * Run the hooks that have to be done when a page I/O has completed.
783 if (PageTestandClearDecrAfter(page))
784 atomic_dec(&nr_async_pages);
786 UnlockPage(page);
788 return;
790 still_busy:
791 spin_unlock_irqrestore(&page_uptodate_lock, flags);
792 return;
796 * Ok, this is getblk, and it isn't very clear, again to hinder
797 * race-conditions. Most of the code is seldom used, (ie repeating),
798 * so it should be much more efficient than it looks.
800 * The algorithm is changed: hopefully better, and an elusive bug removed.
802 * 14.02.92: changed it to sync dirty buffers a bit: better performance
803 * when the filesystem starts to get full of dirty blocks (I hope).
805 struct buffer_head * getblk(kdev_t dev, int block, int size)
807 struct buffer_head * bh;
808 int isize;
810 repeat:
811 spin_lock(&lru_list_lock);
812 write_lock(&hash_table_lock);
813 bh = __get_hash_table(dev, block, size);
814 if (bh)
815 goto out;
817 isize = BUFSIZE_INDEX(size);
818 spin_lock(&free_list[isize].lock);
819 bh = free_list[isize].list;
820 if (bh) {
821 __remove_from_free_list(bh, isize);
822 atomic_set(&bh->b_count, 1);
824 spin_unlock(&free_list[isize].lock);
827 * OK, FINALLY we know that this buffer is the only one of
828 * its kind, we hold a reference (b_count>0), it is unlocked,
829 * and it is clean.
831 if (bh) {
832 init_buffer(bh, end_buffer_io_sync, NULL);
833 bh->b_dev = dev;
834 bh->b_blocknr = block;
835 bh->b_state = 1 << BH_Mapped;
837 /* Insert the buffer into the regular lists */
838 __insert_into_queues(bh);
839 out:
840 write_unlock(&hash_table_lock);
841 spin_unlock(&lru_list_lock);
842 return bh;
846 * If we block while refilling the free list, somebody may
847 * create the buffer first ... search the hashes again.
849 write_unlock(&hash_table_lock);
850 spin_unlock(&lru_list_lock);
851 refill_freelist(size);
852 goto repeat;
855 /* -1 -> no need to flush
856 0 -> async flush
857 1 -> sync flush (wait for I/O completion) */
858 static int balance_dirty_state(kdev_t dev)
860 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
862 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
863 tot = nr_free_buffer_pages();
864 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
866 dirty *= 200;
867 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
868 hard_dirty_limit = soft_dirty_limit * 2;
870 if (dirty > soft_dirty_limit) {
871 if (dirty > hard_dirty_limit)
872 return 1;
873 return 0;
875 return -1;
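/*
 * Worked example, not part of the original file: "dirty * 200 > tot * nfract"
 * is the same as the dirty fraction exceeding nfract/200 of the freeable
 * pages, so the default nfract of 40 gives a soft limit at 20% dirty and a
 * hard limit at twice that, 40%.  The helper below just restates the test
 * in percentage form.
 */
static int example_dirty_state_pct(unsigned long dirty, unsigned long tot)
{
        unsigned long pct = dirty * 100 / tot;          /* caller ensures tot != 0 */
        unsigned long soft_pct = bdf_prm.b_un.nfract / 2;       /* 40 -> 20 */

        if (pct > 2 * soft_pct)
                return 1;       /* sync flush */
        if (pct > soft_pct)
                return 0;       /* async flush */
        return -1;              /* nothing to do */
}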
879 * if a new dirty buffer is created we need to balance bdflush.
881 * in the future we might want to make bdflush aware of different
882 * pressures on different devices - thus the (currently unused)
883 * 'dev' parameter.
885 void balance_dirty(kdev_t dev)
887 int state = balance_dirty_state(dev);
889 if (state < 0)
890 return;
891 wakeup_bdflush(state);
894 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
896 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
897 refile_buffer(bh);
900 /* atomic version, the user must call balance_dirty() by hand
901 as soon as it becomes possible to block */
902 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
904 if (!atomic_set_buffer_dirty(bh))
905 __mark_dirty(bh, flag);
908 void mark_buffer_dirty(struct buffer_head *bh, int flag)
910 __mark_buffer_dirty(bh, flag);
911 balance_dirty(bh->b_dev);
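/*
 * Illustrative sketch, not part of the original file: the __ variant is
 * meant for contexts that must not block, e.g. while holding a spinlock;
 * balance_dirty() is then called by hand once blocking is allowed again.
 * The lock in this example is hypothetical.
 */
static void example_dirty_under_lock(struct buffer_head * bh, spinlock_t * lock)
{
        spin_lock(lock);
        __mark_buffer_dirty(bh, 0);     /* never blocks */
        spin_unlock(lock);
        balance_dirty(bh->b_dev);       /* may wake or wait for bdflush */
}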
915 * A buffer may need to be moved from one buffer list to another
916 * (e.g. in case it is not shared any more). Handle this.
918 static void __refile_buffer(struct buffer_head *bh)
920 int dispose = BUF_CLEAN;
921 if (buffer_locked(bh))
922 dispose = BUF_LOCKED;
923 if (buffer_dirty(bh))
924 dispose = BUF_DIRTY;
925 if (buffer_protected(bh))
926 dispose = BUF_PROTECTED;
927 if (dispose != bh->b_list) {
928 __remove_from_lru_list(bh, bh->b_list);
929 bh->b_list = dispose;
930 __insert_into_lru_list(bh, dispose);
934 void refile_buffer(struct buffer_head *bh)
936 spin_lock(&lru_list_lock);
937 __refile_buffer(bh);
938 spin_unlock(&lru_list_lock);
942 * Release a buffer head
944 void __brelse(struct buffer_head * buf)
946 if (atomic_read(&buf->b_count)) {
947 atomic_dec(&buf->b_count);
948 return;
950 printk("VFS: brelse: Trying to free free buffer\n");
954 * bforget() is like brelse(), except it puts the buffer on the
955 * free list if it can.. We can NOT free the buffer if:
956 * - there are other users of it
957 * - it is locked and thus can have active IO
959 void __bforget(struct buffer_head * buf)
961 /* grab the lru lock here to block bdflush. */
962 spin_lock(&lru_list_lock);
963 write_lock(&hash_table_lock);
964 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
965 goto in_use;
966 __hash_unlink(buf);
967 write_unlock(&hash_table_lock);
968 __remove_from_lru_list(buf, buf->b_list);
969 spin_unlock(&lru_list_lock);
970 put_last_free(buf);
971 return;
973 in_use:
974 write_unlock(&hash_table_lock);
975 spin_unlock(&lru_list_lock);
979 * bread() reads a specified block and returns the buffer that contains
980 * it. It returns NULL if the block was unreadable.
982 struct buffer_head * bread(kdev_t dev, int block, int size)
984 struct buffer_head * bh;
986 bh = getblk(dev, block, size);
987 if (buffer_uptodate(bh))
988 return bh;
989 ll_rw_block(READ, 1, &bh);
990 wait_on_buffer(bh);
991 if (buffer_uptodate(bh))
992 return bh;
993 brelse(bh);
994 return NULL;
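/*
 * Illustrative sketch, not part of the original file: the canonical
 * bread()/brelse() pattern filesystems use to read one metadata block.
 * Block number and size are whatever the caller needs; -EIO on failure
 * is the conventional choice.
 */
static int example_read_metadata(kdev_t dev, int block, int size)
{
        struct buffer_head * bh = bread(dev, block, size);

        if (!bh)
                return -EIO;            /* block was unreadable */
        /* ... use size bytes of up-to-date data at bh->b_data ... */
        brelse(bh);
        return 0;
}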
998 * Ok, breada can be used as bread, but additionally marks other
999 * blocks for reading as well. End the argument list with a negative
1000 * number.
1003 #define NBUF 16
1005 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1006 unsigned int pos, unsigned int filesize)
1008 struct buffer_head * bhlist[NBUF];
1009 unsigned int blocks;
1010 struct buffer_head * bh;
1011 int index;
1012 int i, j;
1014 if (pos >= filesize)
1015 return NULL;
1017 if (block < 0)
1018 return NULL;
1020 bh = getblk(dev, block, bufsize);
1021 index = BUFSIZE_INDEX(bh->b_size);
1023 if (buffer_uptodate(bh))
1024 return(bh);
1025 else ll_rw_block(READ, 1, &bh);
1027 blocks = (filesize - pos) >> (9+index);
1029 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1030 blocks = read_ahead[MAJOR(dev)] >> index;
1031 if (blocks > NBUF)
1032 blocks = NBUF;
1034 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1036 bhlist[0] = bh;
1037 j = 1;
1038 for(i=1; i<blocks; i++) {
1039 bh = getblk(dev,block+i,bufsize);
1040 if (buffer_uptodate(bh)) {
1041 brelse(bh);
1042 break;
1044 else bhlist[j++] = bh;
1047 /* Request the read for these buffers, and then release them. */
1048 if (j>1)
1049 ll_rw_block(READA, (j-1), bhlist+1);
1050 for(i=1; i<j; i++)
1051 brelse(bhlist[i]);
1053 /* Wait for this buffer, and then continue on. */
1054 bh = bhlist[0];
1055 wait_on_buffer(bh);
1056 if (buffer_uptodate(bh))
1057 return bh;
1058 brelse(bh);
1059 return NULL;
1063 * Note: the caller should wake up the buffer_wait list if needed.
1065 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1067 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1068 kmem_cache_free(bh_cachep, bh);
1069 } else {
1070 bh->b_blocknr = -1;
1071 init_waitqueue_head(&bh->b_wait);
1072 nr_unused_buffer_heads++;
1073 bh->b_next_free = unused_list;
1074 bh->b_this_page = NULL;
1075 unused_list = bh;
1080 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1081 * no-buffer-head deadlock. Return NULL on failure; waiting for
1082 * buffer heads is now handled in create_buffers().
1084 static struct buffer_head * get_unused_buffer_head(int async)
1086 struct buffer_head * bh;
1088 spin_lock(&unused_list_lock);
1089 if (nr_unused_buffer_heads > NR_RESERVED) {
1090 bh = unused_list;
1091 unused_list = bh->b_next_free;
1092 nr_unused_buffer_heads--;
1093 spin_unlock(&unused_list_lock);
1094 return bh;
1096 spin_unlock(&unused_list_lock);
1098 /* This is critical. We can't swap out pages to get
1099 * more buffer heads, because the swap-out may need
1100 * more buffer-heads itself. Thus SLAB_BUFFER.
1102 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1103 memset(bh, 0, sizeof(*bh));
1104 init_waitqueue_head(&bh->b_wait);
1105 return bh;
1109 * If we need an async buffer, use the reserved buffer heads.
1111 if (async) {
1112 spin_lock(&unused_list_lock);
1113 if (unused_list) {
1114 bh = unused_list;
1115 unused_list = bh->b_next_free;
1116 nr_unused_buffer_heads--;
1117 spin_unlock(&unused_list_lock);
1118 return bh;
1120 spin_unlock(&unused_list_lock);
1122 #if 0
1124 * (Pending further analysis ...)
1125 * Ordinary (non-async) requests can use a different memory priority
1126 * to free up pages. Any swapping thus generated will use async
1127 * buffer heads.
1129 if(!async &&
1130 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1131 memset(bh, 0, sizeof(*bh));
1132 init_waitqueue_head(&bh->b_wait);
1133 return bh;
1135 #endif
1137 return NULL;
1140 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1142 bh->b_page = page;
1143 if (offset >= PAGE_SIZE)
1144 BUG();
1145 if (PageHighMem(page))
1147 * This catches illegal uses and preserves the offset:
1149 bh->b_data = (char *)(0 + offset);
1150 else
1151 bh->b_data = (char *)(page_address(page) + offset);
1155 * Create the appropriate buffers when given a page for data area and
1156 * the size of each buffer.. Use the bh->b_this_page linked list to
1157 * follow the buffers created. Return NULL if unable to create more
1158 * buffers.
1159 * The async flag is used to differentiate async IO (paging, swapping)
1160 * from ordinary buffer allocations, and only async requests are allowed
1161 * to sleep waiting for buffer heads.
1163 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1165 struct buffer_head *bh, *head;
1166 long offset;
1168 try_again:
1169 head = NULL;
1170 offset = PAGE_SIZE;
1171 while ((offset -= size) >= 0) {
1172 bh = get_unused_buffer_head(async);
1173 if (!bh)
1174 goto no_grow;
1176 bh->b_dev = B_FREE; /* Flag as unused */
1177 bh->b_this_page = head;
1178 head = bh;
1180 bh->b_state = 0;
1181 bh->b_next_free = NULL;
1182 bh->b_pprev = NULL;
1183 atomic_set(&bh->b_count, 0);
1184 bh->b_size = size;
1186 set_bh_page(bh, page, offset);
1188 bh->b_list = BUF_CLEAN;
1189 bh->b_end_io = end_buffer_io_bad;
1191 return head;
1193 * In case anything failed, we just free everything we got.
1195 no_grow:
1196 if (head) {
1197 spin_lock(&unused_list_lock);
1198 do {
1199 bh = head;
1200 head = head->b_this_page;
1201 __put_unused_buffer_head(bh);
1202 } while (head);
1203 spin_unlock(&unused_list_lock);
1205 /* Wake up any waiters ... */
1206 wake_up(&buffer_wait);
1210 * Return failure for non-async IO requests. Async IO requests
1211 * are not allowed to fail, so we have to wait until buffer heads
1212 * become available. But we don't want tasks sleeping with
1213 * partially complete buffers, so all were released above.
1215 if (!async)
1216 return NULL;
1218 /* We're _really_ low on memory. Now we just
1219 * wait for old buffer heads to become free due to
1220 * finishing IO. Since this is an async request and
1221 * the reserve list is empty, we're sure there are
1222 * async buffer heads in use.
1224 run_task_queue(&tq_disk);
1227 * Set our state for sleeping, then check again for buffer heads.
1228 * This ensures we won't miss a wake_up from an interrupt.
1230 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1231 goto try_again;
1234 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1236 struct buffer_head *head, *bh, *tail;
1237 int block;
1239 if (!PageLocked(page))
1240 BUG();
1242 * Allocate async buffer heads pointing to this page, just for I/O.
1243 * They don't show up in the buffer hash table, but they *are*
1244 * registered in page->buffers.
1246 head = create_buffers(page, size, 1);
1247 if (page->buffers)
1248 BUG();
1249 if (!head)
1250 BUG();
1251 tail = head;
1252 for (bh = head; bh; bh = bh->b_this_page) {
1253 block = *(b++);
1255 tail = bh;
1256 init_buffer(bh, end_buffer_io_async, NULL);
1257 bh->b_dev = dev;
1258 bh->b_blocknr = block;
1260 set_bit(BH_Mapped, &bh->b_state);
1262 tail->b_this_page = head;
1263 page_cache_get(page);
1264 page->buffers = head;
1265 return 0;
1268 static void unmap_buffer(struct buffer_head * bh)
1270 if (buffer_mapped(bh)) {
1271 mark_buffer_clean(bh);
1272 wait_on_buffer(bh);
1273 clear_bit(BH_Uptodate, &bh->b_state);
1274 clear_bit(BH_Mapped, &bh->b_state);
1275 clear_bit(BH_Req, &bh->b_state);
1276 clear_bit(BH_New, &bh->b_state);
1281 * We don't have to release all buffers here, but
1282 * we have to be sure that no dirty buffer is left
1283 * and no IO is going on (no buffer is locked), because
1284 * we have truncated the file and are going to free the
1285 * blocks on-disk..
1287 int block_flushpage(struct page *page, unsigned long offset)
1289 struct buffer_head *head, *bh, *next;
1290 unsigned int curr_off = 0;
1292 if (!PageLocked(page))
1293 BUG();
1294 if (!page->buffers)
1295 return 1;
1297 head = page->buffers;
1298 bh = head;
1299 do {
1300 unsigned int next_off = curr_off + bh->b_size;
1301 next = bh->b_this_page;
1304 * is this block fully flushed?
1306 if (offset <= curr_off)
1307 unmap_buffer(bh);
1308 curr_off = next_off;
1309 bh = next;
1310 } while (bh != head);
1313 * subtle. We release buffer-heads only if this is
1314 * the 'final' flushpage. We have invalidated the get_block
1315 * cached value unconditionally, so real IO is not
1316 * possible anymore.
1318 * If the free doesn't work out, the buffers can be
1319 * left around - they just turn into anonymous buffers
1320 * instead.
1322 if (!offset) {
1323 if (!try_to_free_buffers(page, 0)) {
1324 atomic_inc(&buffermem_pages);
1325 return 0;
1329 return 1;
1332 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1334 struct buffer_head *bh, *head, *tail;
1336 head = create_buffers(page, blocksize, 1);
1337 if (page->buffers)
1338 BUG();
1340 bh = head;
1341 do {
1342 bh->b_dev = inode->i_dev;
1343 bh->b_blocknr = 0;
1344 bh->b_end_io = end_buffer_io_bad;
1345 tail = bh;
1346 bh = bh->b_this_page;
1347 } while (bh);
1348 tail->b_this_page = head;
1349 page->buffers = head;
1350 page_cache_get(page);
1354 * We are taking a block for data and we don't want any output from any
1355 * buffer-cache aliases starting from return from that function and
1356 * until the moment when something will explicitly mark the buffer
1357 * dirty (hopefully that will not happen until we will free that block ;-)
1358 * We don't even need to mark it not-uptodate - nobody can expect
1359 anything from a newly allocated buffer anyway. We used to use
1360 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1361 * don't want to mark the alias unmapped, for example - it would confuse
1362 * anyone who might pick it with bread() afterwards...
1365 static void unmap_underlying_metadata(struct buffer_head * bh)
1367 struct buffer_head *old_bh;
1369 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1370 if (old_bh) {
1371 mark_buffer_clean(old_bh);
1372 wait_on_buffer(old_bh);
1373 clear_bit(BH_Req, &old_bh->b_state);
1374 /* Here we could run brelse or bforget. We use
1375 bforget because it will try to put the buffer
1376 in the freelist. */
1377 __bforget(old_bh);
1382 * block_write_full_page() is SMP-safe - currently it's still
1383 * being called with the kernel lock held, but the code is ready.
1385 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1387 int err, i, need_balance_dirty = 0;
1388 unsigned long block;
1389 struct buffer_head *bh, *head;
1391 if (!PageLocked(page))
1392 BUG();
1394 if (!page->buffers)
1395 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1396 head = page->buffers;
1398 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1400 bh = head;
1401 i = 0;
1402 do {
1404 * If the buffer isn't up-to-date, we can't be sure
1405 * that the buffer has been initialized with the proper
1406 * block number information etc..
1408 * Leave it to the low-level FS to make all those
1409 * decisions (block #0 may actually be a valid block)
1411 bh->b_end_io = end_buffer_io_sync;
1412 if (!buffer_mapped(bh)) {
1413 err = get_block(inode, block, bh, 1);
1414 if (err)
1415 goto out;
1416 if (buffer_new(bh))
1417 unmap_underlying_metadata(bh);
1419 set_bit(BH_Uptodate, &bh->b_state);
1420 if (!atomic_set_buffer_dirty(bh)) {
1421 __mark_dirty(bh, 0);
1422 need_balance_dirty = 1;
1425 bh = bh->b_this_page;
1426 block++;
1427 } while (bh != head);
1429 if (need_balance_dirty)
1430 balance_dirty(bh->b_dev);
1432 SetPageUptodate(page);
1433 return 0;
1434 out:
1435 ClearPageUptodate(page);
1436 return err;
1439 static int __block_prepare_write(struct inode *inode, struct page *page,
1440 unsigned from, unsigned to, get_block_t *get_block)
1442 unsigned block_start, block_end;
1443 unsigned long block;
1444 int err = 0;
1445 unsigned blocksize, bbits;
1446 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1447 char *kaddr = (char *)kmap(page);
1449 blocksize = inode->i_sb->s_blocksize;
1450 if (!page->buffers)
1451 create_empty_buffers(page, inode, blocksize);
1452 head = page->buffers;
1454 bbits = inode->i_sb->s_blocksize_bits;
1455 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1457 for(bh = head, block_start = 0; bh != head || !block_start;
1458 block++, block_start=block_end, bh = bh->b_this_page) {
1459 if (!bh)
1460 BUG();
1461 block_end = block_start+blocksize;
1462 if (block_end <= from)
1463 continue;
1464 if (block_start >= to)
1465 break;
1466 bh->b_end_io = end_buffer_io_sync;
1467 if (!buffer_mapped(bh)) {
1468 err = get_block(inode, block, bh, 1);
1469 if (err)
1470 goto out;
1471 if (buffer_new(bh)) {
1472 unmap_underlying_metadata(bh);
1473 if (block_end > to)
1474 memset(kaddr+to, 0, block_end-to);
1475 if (block_start < from)
1476 memset(kaddr+block_start, 0, from-block_start);
1477 continue;
1480 if (!buffer_uptodate(bh) &&
1481 (block_start < from || block_end > to)) {
1482 ll_rw_block(READ, 1, &bh);
1483 *wait_bh++=bh;
1487 * If we issued read requests - let them complete.
1489 while(wait_bh > wait) {
1490 wait_on_buffer(*--wait_bh);
1491 err = -EIO;
1492 if (!buffer_uptodate(*wait_bh))
1493 goto out;
1495 return 0;
1496 out:
1497 return err;
1500 static int __block_commit_write(struct inode *inode, struct page *page,
1501 unsigned from, unsigned to)
1503 unsigned block_start, block_end;
1504 int partial = 0, need_balance_dirty = 0;
1505 unsigned blocksize;
1506 struct buffer_head *bh, *head;
1508 blocksize = inode->i_sb->s_blocksize;
1510 for(bh = head = page->buffers, block_start = 0;
1511 bh != head || !block_start;
1512 block_start=block_end, bh = bh->b_this_page) {
1513 block_end = block_start + blocksize;
1514 if (block_end <= from || block_start >= to) {
1515 if (!buffer_uptodate(bh))
1516 partial = 1;
1517 } else {
1518 set_bit(BH_Uptodate, &bh->b_state);
1519 if (!atomic_set_buffer_dirty(bh)) {
1520 __mark_dirty(bh, 0);
1521 need_balance_dirty = 1;
1526 if (need_balance_dirty)
1527 balance_dirty(bh->b_dev);
1529 * If this is a partial write that happened to make all buffers
1530 * uptodate then we can optimize away a bogus readpage() for
1531 * the next read(). Here we 'discover' whether the page went
1532 * uptodate as a result of this (potentially partial) write.
1534 if (!partial)
1535 SetPageUptodate(page);
1536 return 0;
1540 * Generic "read page" function for block devices that have the normal
1541 * get_block functionality. This is most of the block device filesystems.
1542 * Reads the page asynchronously --- the unlock_buffer() and
1543 * mark_buffer_uptodate() functions propagate buffer state into the
1544 * page struct once IO has completed.
1546 int block_read_full_page(struct page *page, get_block_t *get_block)
1548 struct inode *inode = (struct inode*)page->mapping->host;
1549 unsigned long iblock, lblock;
1550 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1551 unsigned int blocksize, blocks;
1552 unsigned long kaddr = 0;
1553 int nr, i;
1555 if (!PageLocked(page))
1556 PAGE_BUG(page);
1557 blocksize = inode->i_sb->s_blocksize;
1558 if (!page->buffers)
1559 create_empty_buffers(page, inode, blocksize);
1560 head = page->buffers;
1562 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1563 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1564 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1565 bh = head;
1566 nr = 0;
1567 i = 0;
1569 do {
1570 if (buffer_uptodate(bh))
1571 continue;
1573 if (!buffer_mapped(bh)) {
1574 if (iblock < lblock)
1575 get_block(inode, iblock, bh, 0);
1576 if (!buffer_mapped(bh)) {
1577 if (!kaddr)
1578 kaddr = kmap(page);
1579 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1580 set_bit(BH_Uptodate, &bh->b_state);
1581 continue;
1585 init_buffer(bh, end_buffer_io_async, NULL);
1586 atomic_inc(&bh->b_count);
1587 arr[nr] = bh;
1588 nr++;
1589 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1591 if (nr) {
1592 if (Page_Uptodate(page))
1593 BUG();
1594 ll_rw_block(READ, nr, arr);
1595 } else {
1597 * all buffers are uptodate - we can set the page
1598 * uptodate as well.
1600 SetPageUptodate(page);
1601 UnlockPage(page);
1603 if (kaddr)
1604 kunmap(page);
1605 return 0;
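/*
 * Illustrative sketch, not part of the original file: a filesystem with an
 * ordinary get_block routine implements its readpage address_space
 * operation as a thin wrapper around block_read_full_page().  The
 * example_get_block() below is a hypothetical, trivially 1:1 mapping such
 * as a flat block-device-like filesystem might use.
 */
static int example_get_block(struct inode * inode, long block,
                             struct buffer_head * bh_result, int create)
{
        bh_result->b_dev = inode->i_dev;
        bh_result->b_blocknr = block;
        bh_result->b_state |= (1UL << BH_Mapped);
        return 0;
}

static int example_readpage(struct file * file, struct page * page)
{
        return block_read_full_page(page, example_get_block);
}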
1609 * For moronic filesystems that do not allow holes in files.
1610 * We may have to extend the file.
1613 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1615 struct address_space *mapping = page->mapping;
1616 struct inode *inode = (struct inode*)mapping->host;
1617 struct page *new_page;
1618 unsigned long pgpos;
1619 long status;
1620 unsigned zerofrom;
1621 unsigned blocksize = inode->i_sb->s_blocksize;
1622 char *kaddr;
1624 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1625 status = -ENOMEM;
1626 new_page = grab_cache_page(mapping, pgpos);
1627 if (!new_page)
1628 goto out;
1629 /* we might sleep */
1630 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1631 UnlockPage(new_page);
1632 page_cache_release(new_page);
1633 continue;
1635 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1636 if (zerofrom & (blocksize-1)) {
1637 *bytes |= (blocksize-1);
1638 (*bytes)++;
1640 status = __block_prepare_write(inode, new_page, zerofrom,
1641 PAGE_CACHE_SIZE, get_block);
1642 if (status)
1643 goto out_unmap;
1644 kaddr = (char*)page_address(new_page);
1645 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1646 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1647 kunmap(new_page);
1648 UnlockPage(new_page);
1649 page_cache_release(new_page);
1652 if (page->index < pgpos) {
1653 /* completely inside the area */
1654 zerofrom = offset;
1655 } else {
1656 /* page covers the boundary, find the boundary offset */
1657 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1659 /* if we will expand the thing last block will be filled */
1660 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1661 *bytes |= (blocksize-1);
1662 (*bytes)++;
1665 /* starting below the boundary? Nothing to zero out */
1666 if (offset <= zerofrom)
1667 zerofrom = offset;
1669 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1670 if (status)
1671 goto out1;
1672 kaddr = (char*)page_address(page);
1673 if (zerofrom < offset) {
1674 memset(kaddr+zerofrom, 0, offset-zerofrom);
1675 __block_commit_write(inode, page, zerofrom, offset);
1677 return 0;
1678 out1:
1679 ClearPageUptodate(page);
1680 kunmap(page);
1681 return status;
1683 out_unmap:
1684 ClearPageUptodate(new_page);
1685 kunmap(new_page);
1686 UnlockPage(new_page);
1687 page_cache_release(new_page);
1688 out:
1689 return status;
1692 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1693 get_block_t *get_block)
1695 struct inode *inode = (struct inode*)page->mapping->host;
1696 int err = __block_prepare_write(inode, page, from, to, get_block);
1697 if (err) {
1698 ClearPageUptodate(page);
1699 kunmap(page);
1701 return err;
1704 int generic_commit_write(struct file *file, struct page *page,
1705 unsigned from, unsigned to)
1707 struct inode *inode = (struct inode*)page->mapping->host;
1708 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1709 __block_commit_write(inode,page,from,to);
1710 kunmap(page);
1711 if (pos > inode->i_size)
1712 inode->i_size = pos;
1713 return 0;
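/*
 * Illustrative sketch, not part of the original file: how a filesystem is
 * expected to wire these generic helpers into its address_space_operations.
 * Field names and wrapper signatures are recalled from 2.4-era
 * <linux/fs.h> and should be checked against the headers in this tree;
 * example_readpage() and example_get_block() are the hypothetical helpers
 * sketched after block_read_full_page() above.
 */
static int example_prepare_write(struct file * file, struct page * page,
                                 unsigned from, unsigned to)
{
        return block_prepare_write(page, from, to, example_get_block);
}

static int example_bmap(struct address_space * mapping, long block)
{
        return generic_block_bmap(mapping, block, example_get_block);
}

static struct address_space_operations example_aops = {
        readpage:       example_readpage,
        prepare_write:  example_prepare_write,
        commit_write:   generic_commit_write,
        bmap:           example_bmap,
};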
1716 int block_write_full_page(struct page *page, get_block_t *get_block)
1718 struct inode *inode = (struct inode*)page->mapping->host;
1719 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1720 unsigned offset;
1721 int err;
1723 /* easy case */
1724 if (page->index < end_index)
1725 return __block_write_full_page(inode, page, get_block);
1727 /* things got complicated... */
1728 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1729 /* OK, are we completely out? */
1730 if (page->index >= end_index+1 || !offset)
1731 return -EIO;
1732 /* Sigh... will have to work, then... */
1733 err = __block_prepare_write(inode, page, 0, offset, get_block);
1734 if (!err) {
1735 memset((char *)page_address(page)+offset, 0, PAGE_CACHE_SIZE-offset);
1736 __block_commit_write(inode,page,0,offset);
1737 done:
1738 kunmap(page);
1739 return err;
1741 ClearPageUptodate(page);
1742 goto done;
1745 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1747 struct buffer_head tmp;
1748 struct inode *inode = (struct inode*)mapping->host;
1749 tmp.b_state = 0;
1750 tmp.b_blocknr = 0;
1751 get_block(inode, block, &tmp, 0);
1752 return tmp.b_blocknr;
1756 * IO completion routine for a buffer_head being used for kiobuf IO: we
1757 * can't dispatch the kiobuf callback until io_count reaches 0.
1760 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1762 struct kiobuf *kiobuf;
1764 mark_buffer_uptodate(bh, uptodate);
1766 kiobuf = bh->b_private;
1767 unlock_buffer(bh);
1768 end_kio_request(kiobuf, uptodate);
1773 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1774 * for them to complete. Clean up the buffer_heads afterwards.
1777 static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
1779 int iosize;
1780 int i;
1781 struct buffer_head *tmp;
1783 if (rw == WRITE)
1784 rw = WRITERAW;
1785 ll_rw_block(rw, nr, bh);
1787 iosize = 0;
1788 spin_lock(&unused_list_lock);
1790 for (i = nr; --i >= 0; ) {
1791 iosize += size;
1792 tmp = bh[i];
1793 if (buffer_locked(tmp)) {
1794 spin_unlock(&unused_list_lock);
1795 wait_on_buffer(tmp);
1796 spin_lock(&unused_list_lock);
1799 if (!buffer_uptodate(tmp)) {
1800 /* We are traversing bh'es in reverse order so
1801 clearing iosize on error calculates the
1802 amount of IO before the first error. */
1803 iosize = 0;
1805 __put_unused_buffer_head(tmp);
1808 spin_unlock(&unused_list_lock);
1810 return iosize;
1814 * Start I/O on a physical range of kernel memory, defined by a vector
1815 * of kiobuf structs (much like a user-space iovec list).
1817 * The kiobuf must already be locked for IO. IO is submitted
1818 * asynchronously: you need to check page->locked, page->uptodate, and
1819 * maybe wait on page->wait.
1821 * It is up to the caller to make sure that there are enough blocks
1822 * passed in to completely map the iobufs to disk.
1825 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1826 kdev_t dev, unsigned long b[], int size)
1828 int err;
1829 int length;
1830 int transferred;
1831 int i;
1832 int bufind;
1833 int pageind;
1834 int bhind;
1835 int offset;
1836 unsigned long blocknr;
1837 struct kiobuf * iobuf = NULL;
1838 struct page * map;
1839 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1841 if (!nr)
1842 return 0;
1845 * First, do some alignment and validity checks
1847 for (i = 0; i < nr; i++) {
1848 iobuf = iovec[i];
1849 if ((iobuf->offset & (size-1)) ||
1850 (iobuf->length & (size-1)))
1851 return -EINVAL;
1852 if (!iobuf->nr_pages)
1853 panic("brw_kiovec: iobuf not initialised");
1857 * OK to walk down the iovec doing page IO on each page we find.
1859 bufind = bhind = transferred = err = 0;
1860 for (i = 0; i < nr; i++) {
1861 iobuf = iovec[i];
1862 offset = iobuf->offset;
1863 length = iobuf->length;
1864 iobuf->errno = 0;
1866 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1867 map = iobuf->maplist[pageind];
1868 if (!map) {
1869 err = -EFAULT;
1870 goto error;
1873 while (length > 0) {
1874 blocknr = b[bufind++];
1875 tmp = get_unused_buffer_head(0);
1876 if (!tmp) {
1877 err = -ENOMEM;
1878 goto error;
1881 tmp->b_dev = B_FREE;
1882 tmp->b_size = size;
1883 set_bh_page(tmp, map, offset);
1884 tmp->b_this_page = tmp;
1886 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
1887 tmp->b_dev = dev;
1888 tmp->b_blocknr = blocknr;
1889 tmp->b_state = 1 << BH_Mapped;
1891 if (rw == WRITE) {
1892 set_bit(BH_Uptodate, &tmp->b_state);
1893 set_bit(BH_Dirty, &tmp->b_state);
1896 bh[bhind++] = tmp;
1897 length -= size;
1898 offset += size;
1900 atomic_inc(&iobuf->io_count);
1903 * Start the IO if we have got too much
1905 if (bhind >= KIO_MAX_SECTORS) {
1906 err = do_kio(rw, bhind, bh, size);
1907 if (err >= 0)
1908 transferred += err;
1909 else
1910 goto finished;
1911 bhind = 0;
1914 if (offset >= PAGE_SIZE) {
1915 offset = 0;
1916 break;
1918 } /* End of block loop */
1919 } /* End of page loop */
1920 } /* End of iovec loop */
1922 /* Is there any IO still left to submit? */
1923 if (bhind) {
1924 err = do_kio(rw, bhind, bh, size);
1925 if (err >= 0)
1926 transferred += err;
1927 else
1928 goto finished;
1931 finished:
1932 if (transferred)
1933 return transferred;
1934 return err;
1936 error:
1937 /* We got an error allocating the bh'es. Just free the current
1938 buffer_heads and exit. */
1939 spin_lock(&unused_list_lock);
1940 for (i = bhind; --i >= 0; ) {
1941 __put_unused_buffer_head(bh[i]);
1943 spin_unlock(&unused_list_lock);
1944 goto finished;
1948 * Start I/O on a page.
1949 * This function expects the page to be locked and may return
1950 * before I/O is complete. You then have to check page->locked,
1951 * page->uptodate, and maybe wait on page->wait.
1953 * brw_page() is SMP-safe, although it's being called with the
1954 * kernel lock held - but the code is ready.
1956 * FIXME: we need a swapper_inode->get_block function to remove
1957 * some of the bmap kludges and interface ugliness here.
1959 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
1961 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1962 int nr, fresh /* temporary debugging flag */, block;
1964 if (!PageLocked(page))
1965 panic("brw_page: page not locked for I/O");
1966 // ClearPageError(page);
1968 * We pretty much rely on the page lock for this, because
1969 * create_page_buffers() might sleep.
1971 fresh = 0;
1972 if (!page->buffers) {
1973 create_page_buffers(rw, page, dev, b, size);
1974 fresh = 1;
1976 if (!page->buffers)
1977 BUG();
1979 head = page->buffers;
1980 bh = head;
1981 nr = 0;
1982 do {
1983 block = *(b++);
1985 if (fresh && (atomic_read(&bh->b_count) != 0))
1986 BUG();
1987 if (rw == READ) {
1988 if (!fresh)
1989 BUG();
1990 if (!buffer_uptodate(bh)) {
1991 arr[nr++] = bh;
1992 atomic_inc(&bh->b_count);
1994 } else { /* WRITE */
1995 if (!bh->b_blocknr) {
1996 if (!block)
1997 BUG();
1998 bh->b_blocknr = block;
1999 } else {
2000 if (!block)
2001 BUG();
2003 set_bit(BH_Uptodate, &bh->b_state);
2004 set_bit(BH_Dirty, &bh->b_state);
2005 arr[nr++] = bh;
2006 atomic_inc(&bh->b_count);
2008 bh = bh->b_this_page;
2009 } while (bh != head);
2010 if ((rw == READ) && nr) {
2011 if (Page_Uptodate(page))
2012 BUG();
2013 ll_rw_block(rw, nr, arr);
2014 } else {
2015 if (!nr && rw == READ) {
2016 SetPageUptodate(page);
2017 UnlockPage(page);
2019 if (nr && (rw == WRITE))
2020 ll_rw_block(rw, nr, arr);
2022 return 0;
2025 int block_symlink(struct inode *inode, const char *symname, int len)
2027 struct address_space *mapping = inode->i_mapping;
2028 struct page *page = grab_cache_page(mapping, 0);
2029 int err = -ENOMEM;
2030 char *kaddr;
2032 if (!page)
2033 goto fail;
2034 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2035 if (err)
2036 goto fail_map;
2037 kaddr = (char*)page_address(page);
2038 memcpy(kaddr, symname, len-1);
2039 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2041 * Notice that we are _not_ going to block here - end of page is
2042 * unmapped, so this will only try to map the rest of page, see
2043 * that it is unmapped (typically even will not look into inode -
2044 * ->i_size will be enough for everything) and zero it out.
2045 * OTOH it's obviously correct and should make the page up-to-date.
2047 err = mapping->a_ops->readpage(NULL, page);
2048 wait_on_page(page);
2049 page_cache_release(page);
2050 if (err < 0)
2051 goto fail;
2052 mark_inode_dirty(inode);
2053 return 0;
2054 fail_map:
2055 UnlockPage(page);
2056 page_cache_release(page);
2057 fail:
2058 return err;
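/*
 * Illustrative sketch, not part of the original file: a filesystem's
 * symlink() inode operation typically allocates and initialises the new
 * inode itself, then lets block_symlink() push the target string through
 * the page cache; len conventionally includes the trailing NUL.
 */
static int example_finish_symlink(struct inode * inode, struct dentry * dentry,
                                  const char * symname)
{
        int err = block_symlink(inode, symname, strlen(symname) + 1);

        if (!err)
                d_instantiate(dentry, inode);
        return err;
}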
2062 * Try to increase the number of buffers available: the size argument
2063 * is used to determine what kind of buffers we want.
2065 static int grow_buffers(int size)
2067 struct page * page;
2068 struct buffer_head *bh, *tmp;
2069 struct buffer_head * insert_point;
2070 int isize;
2072 if ((size & 511) || (size > PAGE_SIZE)) {
2073 printk("VFS: grow_buffers: size = %d\n",size);
2074 return 0;
2077 page = alloc_page(GFP_BUFFER);
2078 if (!page)
2079 goto out;
2080 bh = create_buffers(page, size, 0);
2081 if (!bh)
2082 goto no_buffer_head;
2084 isize = BUFSIZE_INDEX(size);
2086 spin_lock(&free_list[isize].lock);
2087 insert_point = free_list[isize].list;
2088 tmp = bh;
2089 while (1) {
2090 if (insert_point) {
2091 tmp->b_next_free = insert_point->b_next_free;
2092 tmp->b_prev_free = insert_point;
2093 insert_point->b_next_free->b_prev_free = tmp;
2094 insert_point->b_next_free = tmp;
2095 } else {
2096 tmp->b_prev_free = tmp;
2097 tmp->b_next_free = tmp;
2099 insert_point = tmp;
2100 if (tmp->b_this_page)
2101 tmp = tmp->b_this_page;
2102 else
2103 break;
2105 tmp->b_this_page = bh;
2106 free_list[isize].list = bh;
2107 spin_unlock(&free_list[isize].lock);
2109 page->buffers = bh;
2110 page->flags &= ~(1 << PG_referenced);
2111 lru_cache_add(page);
2112 atomic_inc(&buffermem_pages);
2113 return 1;
2115 no_buffer_head:
2116 page_cache_release(page);
2117 out:
2118 return 0;
2122 * Sync all the buffers on one page..
2124 * If we have old buffers that are locked, we'll
2125 * wait on them, but we won't wait on the new ones
2126 * we're writing out now.
2128 * This all is required so that we can free up memory
2129 * later.
2131 * Wait:
2132 * 0 - no wait (this does not get called - see try_to_free_buffers below)
2133 * 1 - start IO for dirty buffers
2134 * 2 - wait for completion of locked buffers
2136 static void sync_page_buffers(struct buffer_head *bh, int wait)
2138 struct buffer_head * tmp = bh;
2140 do {
2141 struct buffer_head *p = tmp;
2142 tmp = tmp->b_this_page;
2143 if (buffer_locked(p)) {
2144 if (wait > 1)
2145 __wait_on_buffer(p);
2146 } else if (buffer_dirty(p))
2147 ll_rw_block(WRITE, 1, &p);
2148 } while (tmp != bh);
2152 * Can the buffer be thrown out?
2154 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2155 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2157 /*
2158 * try_to_free_buffers() checks if all the buffers on this particular page
2159 * are unused, and frees the page if so.
2161 * If this fails we start writeback on the page's buffers - if we're running low on memory due
2162 * to dirty buffers, we need to flush them out as quickly as possible.
2164 * NOTE: There are quite a number of ways that threads of control can
2165 * obtain a reference to a buffer head within a page. So we must
2166 * lock out all of these paths to cleanly toss the page.
2167 */
2168 int try_to_free_buffers(struct page * page, int wait)
2170 struct buffer_head * tmp, * bh = page->buffers;
2171 int index = BUFSIZE_INDEX(bh->b_size);
2173 spin_lock(&lru_list_lock);
2174 write_lock(&hash_table_lock);
2175 spin_lock(&free_list[index].lock);
2176 tmp = bh;
2177 do {
2178 struct buffer_head *p = tmp;
2180 tmp = tmp->b_this_page;
2181 if (buffer_busy(p))
2182 goto busy_buffer_page;
2183 } while (tmp != bh);
2185 spin_lock(&unused_list_lock);
2186 tmp = bh;
2187 do {
2188 struct buffer_head * p = tmp;
2189 tmp = tmp->b_this_page;
2191 /* The buffer can be either on the regular
2192 * queues or on the free list..
2193 */
2194 if (p->b_dev != B_FREE)
2195 __remove_from_queues(p);
2196 else
2197 __remove_from_free_list(p, index);
2198 __put_unused_buffer_head(p);
2199 } while (tmp != bh);
2200 spin_unlock(&unused_list_lock);
2202 /* Wake up anyone waiting for buffer heads */
2203 wake_up(&buffer_wait);
2205 /* And free the page */
2206 page->buffers = NULL;
2207 page_cache_release(page);
2208 spin_unlock(&free_list[index].lock);
2209 write_unlock(&hash_table_lock);
2210 spin_unlock(&lru_list_lock);
2211 return 1;
2213 busy_buffer_page:
2214 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2215 spin_unlock(&free_list[index].lock);
2216 write_unlock(&hash_table_lock);
2217 spin_unlock(&lru_list_lock);
2218 if (wait)
2219 sync_page_buffers(bh, wait);
2220 return 0;
2221 }
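#if 0
/* Editorial illustration, not compiled: the typical shape of a
 * reclaim-side caller.  "example_release_buffers" is a hypothetical
 * name; the real callers live in mm/ and are assumed here to already
 * hold a reference on the page. */
static int example_release_buffers(struct page *page, int wait)
{
	if (page->buffers && !try_to_free_buffers(page, wait))
		return 0;	/* busy; writeback was started if wait != 0 */
	/* page->buffers is now NULL (or was never set), so the page
	 * itself can be freed once the caller drops its reference. */
	return 1;
}
#endif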
2223 /* ================== Debugging =================== */
2225 void show_buffers(void)
2227 #ifdef CONFIG_SMP
2228 struct buffer_head * bh;
2229 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2230 int protected = 0;
2231 int nlist;
2232 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2233 #endif
2235 printk("Buffer memory: %6dkB\n",
2236 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2238 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2239 if (!spin_trylock(&lru_list_lock))
2240 return;
2241 for(nlist = 0; nlist < NR_LIST; nlist++) {
2242 found = locked = dirty = used = lastused = protected = 0;
2243 bh = lru_list[nlist];
2244 if(!bh) continue;
2246 do {
2247 found++;
2248 if (buffer_locked(bh))
2249 locked++;
2250 if (buffer_protected(bh))
2251 protected++;
2252 if (buffer_dirty(bh))
2253 dirty++;
2254 if (atomic_read(&bh->b_count))
2255 used++, lastused = found;
2256 bh = bh->b_next_free;
2257 } while (bh != lru_list[nlist]);
2258 {
2259 int tmp = nr_buffers_type[nlist];
2260 if (found != tmp)
2261 printk("%9s: BUG -> found %d, reported %d\n",
2262 buf_types[nlist], found, tmp);
2263 }
2264 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2265 "%d locked, %d protected, %d dirty\n",
2266 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2267 used, lastused, locked, protected, dirty);
2268 }
2269 spin_unlock(&lru_list_lock);
2270 #endif
2271 }
2273 /* ===================== Init ======================= */
2275 /*
2276 * Allocate the hash table and init the free lists.
2277 * Use gfp() for the hash table to decrease TLB misses, use
2278 * SLAB cache for buffer heads.
2279 */
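/*
 * Worked example (editorial, illustrative only; assumes 4 KB pages and
 * 4-byte pointers): with 64 MB of RAM, mempages is 16384, so the scaling
 * in buffer_init() gives 16384 >> 14 = 1 and 1 * sizeof(struct buffer_head *)
 * = 4 bytes, and the order loop settles on order 2, i.e. a 16 KB table.
 * That yields nr_hash = (4096 << 2) / 4 = 4096 hash chains, so
 * bh_hash_mask = 4095 and bh_hash_shift = 12.
 */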
2280 void __init buffer_init(unsigned long mempages)
2282 int order, i;
2283 unsigned int nr_hash;
2285 /* The buffer cache hash table is less important these days,
2286 * trim it a bit.
2287 */
2288 mempages >>= 14;
2290 mempages *= sizeof(struct buffer_head *);
2292 for (order = 0; (1 << order) < mempages; order++)
2293 ;
2295 /* try to allocate something until we get it or we're asking
2296 for something that is really too small
2297 */
2298 do {
2299 unsigned long tmp;
2301 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2302 bh_hash_mask = (nr_hash - 1);
2304 tmp = nr_hash;
2305 bh_hash_shift = 0;
2306 while((tmp >>= 1UL) != 0UL)
2307 bh_hash_shift++;
2309 hash_table = (struct buffer_head **)
2310 __get_free_pages(GFP_ATOMIC, order);
2311 } while (hash_table == NULL && --order > 0);
2312 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2313 nr_hash, order, (PAGE_SIZE << order));
2315 if (!hash_table)
2316 panic("Failed to allocate buffer hash table\n");
2318 /* Setup hash chains. */
2319 for(i = 0; i < nr_hash; i++)
2320 hash_table[i] = NULL;
2322 /* Setup free lists. */
2323 for(i = 0; i < NR_SIZES; i++) {
2324 free_list[i].list = NULL;
2325 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2326 }
2328 /* Setup lru lists. */
2329 for(i = 0; i < NR_LIST; i++)
2330 lru_list[i] = NULL;
2335 /* ====================== bdflush support =================== */
2337 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2338 * response to dirty buffers. Once this process is activated, we write back
2339 * a limited number of buffers to the disks and then go back to sleep again.
2340 */
2341 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2342 struct task_struct *bdflush_tsk = 0;
2344 void wakeup_bdflush(int block)
2346 DECLARE_WAITQUEUE(wait, current);
2348 if (current == bdflush_tsk)
2349 return;
2351 if (!block) {
2352 wake_up_process(bdflush_tsk);
2353 return;
2354 }
2356 /* kflushd can wake us up before we have a chance to
2357 go to sleep, so we must be smart in handling
2358 this wakeup event from kflushd to avoid deadlocking on SMP
2359 (we are not holding any lock anymore in these two paths). */
2360 __set_current_state(TASK_UNINTERRUPTIBLE);
2361 add_wait_queue(&bdflush_done, &wait);
2363 wake_up_process(bdflush_tsk);
2364 schedule();
2366 remove_wait_queue(&bdflush_done, &wait);
2367 __set_current_state(TASK_RUNNING);
2368 }
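#if 0
/* Editorial illustration, not compiled: the two intended uses of
 * wakeup_bdflush().  The caller names are hypothetical. */
static void example_kick_background_flush(void)
{
	/* Just wake kflushd and keep going. */
	wakeup_bdflush(0);
}

static void example_throttle_dirty_producer(void)
{
	/* Block on bdflush_done until kflushd has completed a pass,
	 * which throttles whoever is generating the dirty buffers. */
	wakeup_bdflush(1);
}
#endif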
2370 /* This is the _only_ function that deals with flushing async writes
2371 to disk.
2372 NOTE: we _only_ need to browse the DIRTY lru list
2373 as all dirty buffers live _only_ in the DIRTY lru list.
2374 As we never browse the LOCKED and CLEAN lru lists they are in fact
2375 completely useless. */
2376 static int flush_dirty_buffers(int check_flushtime)
2378 struct buffer_head * bh, *next;
2379 int flushed = 0, i;
2381 restart:
2382 spin_lock(&lru_list_lock);
2383 bh = lru_list[BUF_DIRTY];
2384 if (!bh)
2385 goto out_unlock;
2386 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2387 next = bh->b_next_free;
2389 if (!buffer_dirty(bh)) {
2390 __refile_buffer(bh);
2391 continue;
2392 }
2393 if (buffer_locked(bh))
2394 continue;
2396 if (check_flushtime) {
2397 /* The dirty lru list is chronologically ordered, so
2398 if the current bh has not timed out yet,
2399 then all the following bhs
2400 will be too young as well. */
2401 if (time_before(jiffies, bh->b_flushtime))
2402 goto out_unlock;
2403 } else {
2404 if (++flushed > bdf_prm.b_un.ndirty)
2405 goto out_unlock;
2406 }
2408 /* OK, now we are committed to write it out. */
2409 atomic_inc(&bh->b_count);
2410 spin_unlock(&lru_list_lock);
2411 ll_rw_block(WRITE, 1, &bh);
2412 atomic_dec(&bh->b_count);
2414 if (current->need_resched)
2415 schedule();
2416 goto restart;
2417 }
2418 out_unlock:
2419 spin_unlock(&lru_list_lock);
2421 return flushed;
2422 }
2424 /*
2425 * Here we attempt to write back old buffers. We also try to flush inodes
2426 * and supers as well, since this function is essentially "update", and
2427 * otherwise there would be no way of ensuring that these quantities ever
2428 * get written back. Ideally, we would have a timestamp on the inodes
2429 * and superblocks so that we could write back only the old ones as well.
2430 */
2432 static int sync_old_buffers(void)
2434 lock_kernel();
2435 sync_supers(0);
2436 sync_inodes(0);
2437 unlock_kernel();
2439 flush_dirty_buffers(1);
2440 /* must really sync all the active I/O request to disk here */
2441 run_task_queue(&tq_disk);
2442 return 0;
2445 int block_sync_page(struct page *page)
2447 run_task_queue(&tq_disk);
2448 return 0;
2451 /* This is the interface to bdflush. As we get more sophisticated, we can
2452 * pass tuning parameters to this "process", to adjust how it behaves.
2453 * We would want to verify each parameter, however, to make sure that it
2454 * is reasonable. */
2456 asmlinkage long sys_bdflush(int func, long data)
2458 if (!capable(CAP_SYS_ADMIN))
2459 return -EPERM;
2461 if (func == 1) {
2462 /* do_exit directly and let kupdate do its work alone. */
2463 do_exit(0);
2464 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2465 a syscall that doesn't care about the current mm context. */
2466 int error;
2467 struct mm_struct *user_mm;
2469 /*
2470 * bdflush will spend all of its time in kernel-space,
2471 * without touching user-space, so we can switch it into
2472 * 'lazy TLB mode' to reduce the cost of context-switches
2473 * to and from bdflush.
2474 */
2475 user_mm = start_lazy_tlb();
2476 error = sync_old_buffers();
2477 end_lazy_tlb(user_mm);
2478 return error;
2479 #endif
2480 }
2482 /* For func >= 2: even values read a tuning parameter, odd values set it - func 2*i+2 reads param i, func 2*i+3 writes it */
2483 if (func >= 2) {
2484 int i = (func-2) >> 1;
2485 if (i >= 0 && i < N_PARAM) {
2486 if ((func & 1) == 0)
2487 return put_user(bdf_prm.data[i], (int*)data);
2489 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2490 bdf_prm.data[i] = data;
2491 return 0;
2492 }
2493 }
2494 return -EINVAL;
2495 }
2497 /* Calling with func 0 used to launch the actual bdflush and then never
2498 * return (unless it was explicitly killed). We return zero here to
2499 * remain semi-compatible with present update(8) programs.
2500 */
2501 return 0;
2502 }
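#if 0
/* Editorial illustration, not part of the kernel build: how an
 * update(8)-style user-space tool could read and set the first tuning
 * parameter (nfract) through the encoding used above.  Assumes a libc
 * that exposes the call via syscall(2); error handling omitted. */
#include <sys/syscall.h>
#include <unistd.h>

static int read_nfract(void)
{
	int value = 0;
	/* func 2*N+2 reads parameter N into the int that data points to */
	syscall(SYS_bdflush, 2, (long) &value);
	return value;
}

static long set_nfract(long value)
{
	/* func 2*N+3 sets parameter N to the value passed as data */
	return syscall(SYS_bdflush, 3, value);
}
#endif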
2504 /*
2505 * This is the actual bdflush daemon itself. It used to be started from
2506 * the syscall above, but now we launch it ourselves internally with
2507 * kernel_thread(...) directly after the first thread in init/main.c.
2508 */
2509 int bdflush(void *sem)
2511 struct task_struct *tsk = current;
2512 int flushed;
2513 /*
2514 * We have a bare-bones task_struct, and really should fill
2515 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2516 * display semi-sane things. Not really crucial though...
2517 */
2519 tsk->session = 1;
2520 tsk->pgrp = 1;
2521 strcpy(tsk->comm, "kflushd");
2522 bdflush_tsk = tsk;
2524 /* avoid getting signals */
2525 spin_lock_irq(&tsk->sigmask_lock);
2526 flush_signals(tsk);
2527 sigfillset(&tsk->blocked);
2528 recalc_sigpending(tsk);
2529 spin_unlock_irq(&tsk->sigmask_lock);
2531 up((struct semaphore *)sem);
2533 for (;;) {
2534 CHECK_EMERGENCY_SYNC
2536 flushed = flush_dirty_buffers(0);
2538 /* If wakeup_bdflush wakes us up again after
2539 our bdflush_done wakeup below, then we
2540 must make sure not to go back to sleep
2541 afterwards, otherwise wakeup_bdflush
2542 could end up waiting for a bdflush_done
2543 wakeup that would never arrive (as we
2544 would be sleeping) and so it would
2545 deadlock on SMP. */
2546 __set_current_state(TASK_INTERRUPTIBLE);
2547 wake_up(&bdflush_done);
2548 /*
2549 * If there are still a lot of dirty buffers around,
2550 * skip the sleep and flush some more. Otherwise, we
2551 * go to sleep waiting for a wakeup.
2552 */
2553 if (!flushed || balance_dirty_state(NODEV) < 0)
2554 schedule();
2555 /* Remember to mark us as running otherwise
2556 the next schedule will block. */
2557 __set_current_state(TASK_RUNNING);
2558 }
2559 }
2561 /*
2562 * This is the kernel update daemon. It used to live in userspace
2563 * but since it needs to run safely we want it to be unkillable by mistake.
2564 * You don't need to change your userspace configuration since
2565 * the userspace `update` will do_exit(0) at its first sys_bdflush() call.
2566 */
2567 int kupdate(void *sem)
2569 struct task_struct * tsk = current;
2570 int interval;
2572 tsk->session = 1;
2573 tsk->pgrp = 1;
2574 strcpy(tsk->comm, "kupdate");
2576 /* SIGSTOP and SIGCONT will stop and wake up kupdate */
2577 spin_lock_irq(&tsk->sigmask_lock);
2578 sigfillset(&tsk->blocked);
2579 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2580 recalc_sigpending(tsk);
2581 spin_unlock_irq(&tsk->sigmask_lock);
2583 up((struct semaphore *)sem);
2585 for (;;) {
2586 /* update interval */
2587 interval = bdf_prm.b_un.interval;
2588 if (interval) {
2589 tsk->state = TASK_INTERRUPTIBLE;
2590 schedule_timeout(interval);
2591 } else {
2592 stop_kupdate:
2593 tsk->state = TASK_STOPPED;
2594 schedule(); /* wait for SIGCONT */
2595 }
2596 /* check for sigstop */
2597 if (signal_pending(tsk)) {
2598 int stopped = 0;
2599 spin_lock_irq(&tsk->sigmask_lock);
2600 if (sigismember(&tsk->signal, SIGSTOP)) {
2601 sigdelset(&tsk->signal, SIGSTOP);
2602 stopped = 1;
2603 }
2604 recalc_sigpending(tsk);
2605 spin_unlock_irq(&tsk->sigmask_lock);
2606 if (stopped)
2607 goto stop_kupdate;
2608 }
2609 #ifdef DEBUG
2610 printk("kupdate() activated...\n");
2611 #endif
2612 sync_old_buffers();
2613 }
2614 }
2616 static int __init bdflush_init(void)
2618 DECLARE_MUTEX_LOCKED(sem);
2619 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2620 down(&sem);
2621 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2622 down(&sem);
2623 return 0;
2624 }
2626 module_init(bdflush_init)