pre3: [davej-history.git] / fs / buffer.c
blob b9120655e7741f5a8d016fb1d8095d04f895c37b
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
53 #define NR_SIZES 7
54 static char buffersize_index[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
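/*
 * The table above maps (size >> 9) to a small index, so BUFSIZE_INDEX()
 * yields 512->0, 1024->1, 2048->2, 4096->3, 8192->4, 16384->5, 32768->6,
 * and -1 for any size that is not a supported power of two.  It is used
 * to pick the per-size free list, e.g. free_list[BUFSIZE_INDEX(1024)].
 */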
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS (NR_RESERVED+20) /* don't ever have more than this
65 number of unused buffer heads */
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
72 * Hash table gook..
74 static unsigned int bh_hash_mask;
75 static unsigned int bh_hash_shift;
76 static struct buffer_head **hash_table;
77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
79 static struct buffer_head *lru_list[NR_LIST];
80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
81 static int nr_buffers_type[NR_LIST];
82 static unsigned long size_buffers_type[NR_LIST];
84 static struct buffer_head * unused_list;
85 static int nr_unused_buffer_heads;
86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
89 struct bh_free_head {
90 struct buffer_head *list;
91 spinlock_t lock;
93 static struct bh_free_head free_list[NR_SIZES];
95 static int grow_buffers(int size);
96 static void __refile_buffer(struct buffer_head *);
98 /* This is used by some architectures to estimate available memory. */
99 atomic_t buffermem_pages = ATOMIC_INIT(0);
101 /* Here is the parameter block for the bdflush process. If you add or
102 * remove any of the parameters, make sure to update kernel/sysctl.c.
105 #define N_PARAM 9
107 /* The dummy values in this structure are left in there for compatibility
108 * with old programs that play with the /proc entries.
110 union bdflush_param {
111 struct {
112 int nfract; /* Percentage of buffer cache dirty to
113 activate bdflush */
114 int ndirty; /* Maximum number of dirty blocks to write out per
115 wake-cycle */
116 int nrefill; /* Number of clean buffers to try to obtain
117 each time we call refill */
118 int nref_dirt; /* Dirty buffer threshold for activating bdflush
119 when trying to refill buffers. */
120 int interval; /* jiffies delay between kupdate flushes */
121 int age_buffer; /* Time for normal buffer to age before we flush it */
122 int dummy1; /* unused, was age_super */
123 int dummy2; /* unused */
124 int dummy3; /* unused */
125 } b_un;
126 unsigned int data[N_PARAM];
127 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
129 /* These are the min and max parameter values that we will allow to be assigned */
130 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
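/*
 * The nine data[] slots above are what the bdflush syscall and, in this
 * era, the vm.bdflush sysctl expose, in the same order as the b_un
 * fields; bdflush_min[] and bdflush_max[] bound the values that may be
 * stored there.
 */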
134 * Rewrote the wait-routines to use the "new" wait-queue functionality,
135 * getting rid of the cli-sti pairs. The wait-queue routines still
136 * need cli-sti, but now it's just a couple of 386 instructions or so.
138 * Note that the real wait_on_buffer() is an inline function that checks
139 * if 'b_wait' is set before calling this, so that the queues aren't set
140 * up unnecessarily.
142 void __wait_on_buffer(struct buffer_head * bh)
144 struct task_struct *tsk = current;
145 DECLARE_WAITQUEUE(wait, tsk);
147 atomic_inc(&bh->b_count);
148 add_wait_queue(&bh->b_wait, &wait);
149 do {
150 run_task_queue(&tq_disk);
151 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
152 if (!buffer_locked(bh))
153 break;
154 schedule();
155 } while (buffer_locked(bh));
156 tsk->state = TASK_RUNNING;
157 remove_wait_queue(&bh->b_wait, &wait);
158 atomic_dec(&bh->b_count);
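/*
 * The loop above is the classic sleep pattern: take a reference so the
 * buffer head cannot be freed under us, queue ourselves, and only then
 * re-check buffer_locked() after setting TASK_UNINTERRUPTIBLE, so a
 * wake-up arriving between the check and schedule() cannot be lost.
 * run_task_queue(&tq_disk) kicks any queued block I/O so the unlock we
 * are waiting for actually happens.
 */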
161 /* Call sync_buffers with wait!=0 to ensure that the call does not
162 * return until all buffer writes have completed. Sync() may return
163 * before the writes have finished; fsync() may not.
166 /* Godamity-damn. Some buffers (bitmaps for filesystems)
167 * spontaneously dirty themselves without ever brelse being called.
168 * We will ultimately want to put these in a separate list, but for
169 * now we search all of the lists for dirty buffers.
171 static int sync_buffers(kdev_t dev, int wait)
173 int i, retry, pass = 0, err = 0;
174 struct buffer_head * bh, *next;
176 /* One pass for no-wait, three for wait:
177 * 0) write out all dirty, unlocked buffers;
178 * 1) write out all dirty buffers, waiting if locked;
179 * 2) wait for completion by waiting for all buffers to unlock.
181 do {
182 retry = 0;
184 /* We search all lists as a failsafe mechanism, not because we expect
185 * there to be dirty buffers on any of the other lists.
187 repeat:
188 spin_lock(&lru_list_lock);
189 bh = lru_list[BUF_DIRTY];
190 if (!bh)
191 goto repeat2;
193 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
194 next = bh->b_next_free;
196 if (!lru_list[BUF_DIRTY])
197 break;
198 if (dev && bh->b_dev != dev)
199 continue;
200 if (buffer_locked(bh)) {
201 /* Buffer is locked; skip it unless wait is
202 * requested AND pass > 0.
204 if (!wait || !pass) {
205 retry = 1;
206 continue;
208 atomic_inc(&bh->b_count);
209 spin_unlock(&lru_list_lock);
210 wait_on_buffer (bh);
211 atomic_dec(&bh->b_count);
212 goto repeat;
215 /* If an unlocked buffer is not uptodate, there has
216 * been an IO error. Skip it.
218 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
219 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
220 err = -EIO;
221 continue;
224 /* Don't write clean buffers. Don't write ANY buffers
225 * on the third pass.
227 if (!buffer_dirty(bh) || pass >= 2)
228 continue;
230 atomic_inc(&bh->b_count);
231 spin_unlock(&lru_list_lock);
232 ll_rw_block(WRITE, 1, &bh);
233 atomic_dec(&bh->b_count);
234 retry = 1;
235 goto repeat;
238 repeat2:
239 bh = lru_list[BUF_LOCKED];
240 if (!bh) {
241 spin_unlock(&lru_list_lock);
242 break;
244 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
245 next = bh->b_next_free;
247 if (!lru_list[BUF_LOCKED])
248 break;
249 if (dev && bh->b_dev != dev)
250 continue;
251 if (buffer_locked(bh)) {
252 /* Buffer is locked; skip it unless wait is
253 * requested AND pass > 0.
255 if (!wait || !pass) {
256 retry = 1;
257 continue;
259 atomic_inc(&bh->b_count);
260 spin_unlock(&lru_list_lock);
261 wait_on_buffer (bh);
262 spin_lock(&lru_list_lock);
263 atomic_dec(&bh->b_count);
264 goto repeat2;
267 spin_unlock(&lru_list_lock);
269 /* If we are waiting for the sync to succeed, and if any dirty
270 * blocks were written, then repeat; on the second pass, only
271 * wait for buffers being written (do not pass to write any
272 * more buffers on the second pass).
274 } while (wait && retry && ++pass<=2);
275 return err;
278 void sync_dev(kdev_t dev)
280 sync_supers(dev);
281 sync_inodes(dev);
282 DQUOT_SYNC(dev);
283 /* sync all the dirty buffers out to disk only _after_ all the
284 high level layers have finished generating dirty buffer data
285 (or we'll return with some buffer still dirty on the blockdevice
286 so breaking the semantics of this call) */
287 sync_buffers(dev, 0);
289 * FIXME(eric) we need to sync the physical devices here.
290 * This is because some (scsi) controllers have huge amounts of
291 * cache onboard (hundreds of Mb), and we need to instruct
292 * them to commit all of the dirty memory to disk, and we should
293 * not return until this has happened.
295 * This would need to get implemented by going through the assorted
296 * layers so that each block major number can be synced, and this
297 * would call down into the upper and mid-layer scsi.
301 int fsync_dev(kdev_t dev)
303 sync_buffers(dev, 0);
305 lock_kernel();
306 sync_supers(dev);
307 sync_inodes(dev);
308 DQUOT_SYNC(dev);
309 unlock_kernel();
311 return sync_buffers(dev, 1);
314 asmlinkage long sys_sync(void)
316 fsync_dev(0);
317 return 0;
321 * filp may be NULL if called via the msync of a vma.
324 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
326 struct inode * inode = dentry->d_inode;
327 struct super_block * sb;
328 kdev_t dev;
329 int ret;
331 lock_kernel();
332 /* sync the inode to buffers */
333 write_inode_now(inode, 0);
335 /* sync the superblock to buffers */
336 sb = inode->i_sb;
337 wait_on_super(sb);
338 if (sb->s_op && sb->s_op->write_super)
339 sb->s_op->write_super(sb);
341 /* .. finally sync the buffers to disk */
342 dev = inode->i_dev;
343 ret = sync_buffers(dev, 1);
344 unlock_kernel();
345 return ret;
348 asmlinkage long sys_fsync(unsigned int fd)
350 struct file * file;
351 struct dentry * dentry;
352 struct inode * inode;
353 int err;
355 err = -EBADF;
356 file = fget(fd);
357 if (!file)
358 goto out;
360 dentry = file->f_dentry;
361 inode = dentry->d_inode;
363 err = -EINVAL;
364 if (!file->f_op || !file->f_op->fsync)
365 goto out_putf;
367 /* We need to protect against concurrent writers.. */
368 down(&inode->i_sem);
369 err = file->f_op->fsync(file, dentry, 0);
370 up(&inode->i_sem);
372 out_putf:
373 fput(file);
374 out:
375 return err;
378 asmlinkage long sys_fdatasync(unsigned int fd)
380 struct file * file;
381 struct dentry * dentry;
382 struct inode * inode;
383 int err;
385 err = -EBADF;
386 file = fget(fd);
387 if (!file)
388 goto out;
390 dentry = file->f_dentry;
391 inode = dentry->d_inode;
393 err = -EINVAL;
394 if (!file->f_op || !file->f_op->fsync)
395 goto out_putf;
397 down(&inode->i_sem);
398 err = file->f_op->fsync(file, dentry, 1);
399 up(&inode->i_sem);
401 out_putf:
402 fput(file);
403 out:
404 return err;
407 /* After several hours of tedious analysis, the following hash
408 * function won. Do not mess with it... -DaveM
410 #define _hashfn(dev,block) \
411 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
412 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
413 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
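/*
 * A lookup hashes (dev, block), masks the result into the table (the
 * table size is a power of two; bh_hash_mask is presumably set to
 * size-1 in buffer_init(), which is not shown here), and then walks the
 * b_next chain comparing dev, block and size - exactly what
 * __get_hash_table() below does under the hash_table_lock.
 */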
415 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
417 if ((bh->b_next = *head) != NULL)
418 bh->b_next->b_pprev = &bh->b_next;
419 *head = bh;
420 bh->b_pprev = head;
423 static __inline__ void __hash_unlink(struct buffer_head *bh)
425 if (bh->b_pprev) {
426 if (bh->b_next)
427 bh->b_next->b_pprev = bh->b_pprev;
428 *(bh->b_pprev) = bh->b_next;
429 bh->b_pprev = NULL;
433 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
435 struct buffer_head **bhp = &lru_list[blist];
437 if(!*bhp) {
438 *bhp = bh;
439 bh->b_prev_free = bh;
441 bh->b_next_free = *bhp;
442 bh->b_prev_free = (*bhp)->b_prev_free;
443 (*bhp)->b_prev_free->b_next_free = bh;
444 (*bhp)->b_prev_free = bh;
445 nr_buffers_type[blist]++;
446 size_buffers_type[blist] += bh->b_size;
449 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
451 if (bh->b_prev_free || bh->b_next_free) {
452 bh->b_prev_free->b_next_free = bh->b_next_free;
453 bh->b_next_free->b_prev_free = bh->b_prev_free;
454 if (lru_list[blist] == bh)
455 lru_list[blist] = bh->b_next_free;
456 if (lru_list[blist] == bh)
457 lru_list[blist] = NULL;
458 bh->b_next_free = bh->b_prev_free = NULL;
459 nr_buffers_type[blist]--;
460 size_buffers_type[blist] -= bh->b_size;
464 static void __remove_from_free_list(struct buffer_head * bh, int index)
466 if(bh->b_next_free == bh)
467 free_list[index].list = NULL;
468 else {
469 bh->b_prev_free->b_next_free = bh->b_next_free;
470 bh->b_next_free->b_prev_free = bh->b_prev_free;
471 if (free_list[index].list == bh)
472 free_list[index].list = bh->b_next_free;
474 bh->b_next_free = bh->b_prev_free = NULL;
477 /* must be called with both the hash_table_lock and the lru_list_lock
478 held */
479 static void __remove_from_queues(struct buffer_head *bh)
481 __hash_unlink(bh);
482 __remove_from_lru_list(bh, bh->b_list);
485 static void __insert_into_queues(struct buffer_head *bh)
487 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
489 __hash_link(bh, head);
490 __insert_into_lru_list(bh, bh->b_list);
493 /* This function must only run if there are no other
494 * references _anywhere_ to this buffer head.
496 static void put_last_free(struct buffer_head * bh)
498 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
499 struct buffer_head **bhp = &head->list;
501 bh->b_state = 0;
503 spin_lock(&head->lock);
504 bh->b_dev = B_FREE;
505 if(!*bhp) {
506 *bhp = bh;
507 bh->b_prev_free = bh;
509 bh->b_next_free = *bhp;
510 bh->b_prev_free = (*bhp)->b_prev_free;
511 (*bhp)->b_prev_free->b_next_free = bh;
512 (*bhp)->b_prev_free = bh;
513 spin_unlock(&head->lock);
517 * Why like this, I hear you say... The reason is race-conditions.
518 * As we don't lock buffers (unless we are reading them, that is),
519 * something might happen to it while we sleep (ie a read-error
520 * will force it bad). This shouldn't really happen currently, but
521 * the code is ready.
523 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
525 struct buffer_head *bh = hash(dev, block);
527 for (; bh; bh = bh->b_next)
528 if (bh->b_blocknr == block &&
529 bh->b_size == size &&
530 bh->b_dev == dev)
531 break;
532 if (bh)
533 atomic_inc(&bh->b_count);
535 return bh;
538 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
540 struct buffer_head *bh;
542 read_lock(&hash_table_lock);
543 bh = __get_hash_table(dev, block, size);
544 read_unlock(&hash_table_lock);
546 return bh;
549 unsigned int get_hardblocksize(kdev_t dev)
552 * Get the hard sector size for the given device. If we don't know
553 * what it is, return 0.
555 if (hardsect_size[MAJOR(dev)] != NULL) {
556 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
557 if (blksize != 0)
558 return blksize;
562 * We don't know what the hardware sector size for this device is.
563 * Return 0 indicating that we don't know.
565 return 0;
568 /* If invalidate_buffers() trashes dirty buffers, it means some kind
569    of fs corruption is going on. Trashing dirty data always implies losing
570    information that was supposed to be just stored on the physical layer
571    by the user.
573    Thus invalidate_buffers in general usage is not allowed to trash dirty
574    buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
576    NOTE: if the user removes a removable-media disk while there is still
577    dirty data not synced to disk (due to a bug in the device driver
578    or to an error of the user), then by not destroying the dirty buffers we
579    could also corrupt the next media inserted; thus a parameter is
580    necessary to handle this case in the safest way possible (trying
581    not to corrupt the newly inserted disk with data belonging to
582    the old, now corrupted, disk). Also, for a ramdisk the natural way
583    to release the ramdisk memory is to destroy its dirty buffers.
585    These are two special cases. Normal usage implies that the device driver
586    issues a sync on the device (without waiting for I/O completion) and
587    then an invalidate_buffers call that doesn't trash dirty buffers. */
588 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
590 int i, nlist, slept;
591 struct buffer_head * bh, * bh_next;
593 retry:
594 slept = 0;
595 spin_lock(&lru_list_lock);
596 for(nlist = 0; nlist < NR_LIST; nlist++) {
597 bh = lru_list[nlist];
598 if (!bh)
599 continue;
600 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
601 bh_next = bh->b_next_free;
602 if (bh->b_dev != dev)
603 continue;
604 if (buffer_locked(bh)) {
605 atomic_inc(&bh->b_count);
606 spin_unlock(&lru_list_lock);
607 wait_on_buffer(bh);
608 slept = 1;
609 spin_lock(&lru_list_lock);
610 atomic_dec(&bh->b_count);
613 write_lock(&hash_table_lock);
614 if (!atomic_read(&bh->b_count) &&
615 (destroy_dirty_buffers || !buffer_dirty(bh))) {
616 __remove_from_queues(bh);
617 put_last_free(bh);
619 write_unlock(&hash_table_lock);
620 if (slept)
621 goto out;
624 out:
625 spin_unlock(&lru_list_lock);
626 if (slept)
627 goto retry;
630 void set_blocksize(kdev_t dev, int size)
632 extern int *blksize_size[];
633 int i, nlist, slept;
634 struct buffer_head * bh, * bh_next;
636 if (!blksize_size[MAJOR(dev)])
637 return;
639 /* Size must be a power of two, and between 512 and PAGE_SIZE */
640 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
641 panic("Invalid blocksize passed to set_blocksize");
643 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
644 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
645 return;
647 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
648 return;
649 sync_buffers(dev, 2);
650 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
652 retry:
653 slept = 0;
654 spin_lock(&lru_list_lock);
655 for(nlist = 0; nlist < NR_LIST; nlist++) {
656 bh = lru_list[nlist];
657 if (!bh)
658 continue;
659 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
660 bh_next = bh->b_next_free;
661 if (bh->b_dev != dev || bh->b_size == size)
662 continue;
663 if (buffer_locked(bh)) {
664 atomic_inc(&bh->b_count);
665 spin_unlock(&lru_list_lock);
666 wait_on_buffer(bh);
667 slept = 1;
668 spin_lock(&lru_list_lock);
669 atomic_dec(&bh->b_count);
672 write_lock(&hash_table_lock);
673 if (!atomic_read(&bh->b_count)) {
674 if (buffer_dirty(bh))
675 printk(KERN_WARNING
676 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
677 kdevname(dev), bh->b_blocknr, bh->b_size);
678 __remove_from_queues(bh);
679 put_last_free(bh);
680 } else {
681 if (atomic_set_buffer_clean(bh))
682 __refile_buffer(bh);
683 clear_bit(BH_Uptodate, &bh->b_state);
684 printk(KERN_WARNING
685 "set_blocksize: "
686 "b_count %d, dev %s, block %lu, from %p\n",
687 atomic_read(&bh->b_count), bdevname(bh->b_dev),
688 bh->b_blocknr, __builtin_return_address(0));
690 write_unlock(&hash_table_lock);
691 if (slept)
692 goto out;
695 out:
696 spin_unlock(&lru_list_lock);
697 if (slept)
698 goto retry;
702 * We used to try various strange things. Let's not.
704 static void refill_freelist(int size)
706 if (!grow_buffers(size)) {
707 wakeup_bdflush(1);
708 current->policy |= SCHED_YIELD;
709 schedule();
713 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
715 bh->b_list = BUF_CLEAN;
716 bh->b_end_io = handler;
717 bh->b_private = private;
720 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
722 mark_buffer_uptodate(bh, uptodate);
723 unlock_buffer(bh);
726 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
728 mark_buffer_uptodate(bh, uptodate);
729 unlock_buffer(bh);
730 BUG();
733 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
735 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
736 unsigned long flags;
737 struct buffer_head *tmp;
738 struct page *page;
740 mark_buffer_uptodate(bh, uptodate);
742 /* This is a temporary buffer used for page I/O. */
743 page = bh->b_page;
745 if (!uptodate)
746 SetPageError(page);
749 * Be _very_ careful from here on. Bad things can happen if
750 * two buffer heads end IO at almost the same time and both
751 * decide that the page is now completely done.
753 * Async buffer_heads are here only as labels for IO, and get
754 * thrown away once the IO for this page is complete. IO is
755 * deemed complete once all buffers have been visited
756 * (b_count==0) and are now unlocked. We must make sure that
757 * only the _last_ buffer that decrements its count is the one
758 * that unlock the page..
760 spin_lock_irqsave(&page_uptodate_lock, flags);
761 unlock_buffer(bh);
762 atomic_dec(&bh->b_count);
763 tmp = bh->b_this_page;
764 while (tmp != bh) {
765 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
766 goto still_busy;
767 tmp = tmp->b_this_page;
770 /* OK, the async IO on this page is complete. */
771 spin_unlock_irqrestore(&page_uptodate_lock, flags);
774 * if none of the buffers had errors then we can set the
775 * page uptodate:
777 if (!PageError(page))
778 SetPageUptodate(page);
781 * Run the hooks that have to be done when a page I/O has completed.
783 if (PageTestandClearDecrAfter(page))
784 atomic_dec(&nr_async_pages);
786 UnlockPage(page);
788 return;
790 still_busy:
791 spin_unlock_irqrestore(&page_uptodate_lock, flags);
792 return;
796 * Ok, this is getblk, and it isn't very clear, again to hinder
797 * race-conditions. Most of the code is seldom used, (ie repeating),
798 * so it should be much more efficient than it looks.
800 * The algorithm is changed: hopefully better, and an elusive bug removed.
802 * 14.02.92: changed it to sync dirty buffers a bit: better performance
803 * when the filesystem starts to get full of dirty blocks (I hope).
805 struct buffer_head * getblk(kdev_t dev, int block, int size)
807 struct buffer_head * bh;
808 int isize;
810 repeat:
811 spin_lock(&lru_list_lock);
812 write_lock(&hash_table_lock);
813 bh = __get_hash_table(dev, block, size);
814 if (bh)
815 goto out;
817 isize = BUFSIZE_INDEX(size);
818 spin_lock(&free_list[isize].lock);
819 bh = free_list[isize].list;
820 if (bh) {
821 __remove_from_free_list(bh, isize);
822 atomic_set(&bh->b_count, 1);
824 spin_unlock(&free_list[isize].lock);
827 * OK, FINALLY we know that this buffer is the only one of
828 * its kind, we hold a reference (b_count>0), it is unlocked,
829 * and it is clean.
831 if (bh) {
832 init_buffer(bh, end_buffer_io_sync, NULL);
833 bh->b_dev = dev;
834 bh->b_blocknr = block;
835 bh->b_state = 1 << BH_Mapped;
837 /* Insert the buffer into the regular lists */
838 __insert_into_queues(bh);
839 out:
840 write_unlock(&hash_table_lock);
841 spin_unlock(&lru_list_lock);
842 touch_buffer(bh);
843 return bh;
847 * If we block while refilling the free list, somebody may
848 * create the buffer first ... search the hashes again.
850 write_unlock(&hash_table_lock);
851 spin_unlock(&lru_list_lock);
852 refill_freelist(size);
853 goto repeat;
856 /* -1 -> no need to flush
857 0 -> async flush
858    1 -> sync flush (wait for I/O completion) */
859 static int balance_dirty_state(kdev_t dev)
861 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
863 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
864 tot = nr_free_buffer_pages();
865 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
867 dirty *= 200;
868 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
869 hard_dirty_limit = soft_dirty_limit * 2;
871 if (dirty > soft_dirty_limit) {
872 if (dirty > hard_dirty_limit)
873 return 1;
874 return 0;
876 return -1;
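/*
 * Working through the arithmetic above: with dirty scaled by 200 and
 * soft_dirty_limit = tot * nfract, "dirty > soft_dirty_limit" means the
 * dirty fraction exceeds nfract/200 of the usable buffer pages - 20%
 * with the default nfract of 40 - and the hard limit (sync rather than
 * async flush) sits at twice that, i.e. 40%.
 */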
880 * if a new dirty buffer is created we need to balance bdflush.
882 * in the future we might want to make bdflush aware of different
883 * pressures on different devices - thus the (currently unused)
884 * 'dev' parameter.
886 void balance_dirty(kdev_t dev)
888 int state = balance_dirty_state(dev);
890 if (state < 0)
891 return;
892 wakeup_bdflush(state);
895 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
897 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
898 refile_buffer(bh);
901 /* atomic version, the user must call balance_dirty() by hand
902    as soon as it becomes possible to block */
903 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
905 if (!atomic_set_buffer_dirty(bh))
906 __mark_dirty(bh, flag);
909 void mark_buffer_dirty(struct buffer_head *bh, int flag)
911 __mark_buffer_dirty(bh, flag);
912 balance_dirty(bh->b_dev);
916 * A buffer may need to be moved from one buffer list to another
917 * (e.g. in case it is not shared any more). Handle this.
919 static void __refile_buffer(struct buffer_head *bh)
921 int dispose = BUF_CLEAN;
922 if (buffer_locked(bh))
923 dispose = BUF_LOCKED;
924 if (buffer_dirty(bh))
925 dispose = BUF_DIRTY;
926 if (buffer_protected(bh))
927 dispose = BUF_PROTECTED;
928 if (dispose != bh->b_list) {
929 __remove_from_lru_list(bh, bh->b_list);
930 bh->b_list = dispose;
931 __insert_into_lru_list(bh, dispose);
935 void refile_buffer(struct buffer_head *bh)
937 spin_lock(&lru_list_lock);
938 __refile_buffer(bh);
939 spin_unlock(&lru_list_lock);
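/*
 * So every buffer lives on exactly one of the BUF_CLEAN, BUF_LOCKED,
 * BUF_DIRTY or BUF_PROTECTED lists, chosen purely from its state bits;
 * refile_buffer() is called whenever those bits change so the lists
 * stay consistent with reality.
 */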
943 * Release a buffer head
945 void __brelse(struct buffer_head * buf)
947 if (atomic_read(&buf->b_count)) {
948 atomic_dec(&buf->b_count);
949 return;
951 printk("VFS: brelse: Trying to free free buffer\n");
955 * bforget() is like brelse(), except it puts the buffer on the
956 * free list if it can.. We can NOT free the buffer if:
957 * - there are other users of it
958 * - it is locked and thus can have active IO
960 void __bforget(struct buffer_head * buf)
962 /* grab the lru lock here to block bdflush. */
963 spin_lock(&lru_list_lock);
964 write_lock(&hash_table_lock);
965 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
966 goto in_use;
967 __hash_unlink(buf);
968 write_unlock(&hash_table_lock);
969 __remove_from_lru_list(buf, buf->b_list);
970 spin_unlock(&lru_list_lock);
971 put_last_free(buf);
972 return;
974 in_use:
975 write_unlock(&hash_table_lock);
976 spin_unlock(&lru_list_lock);
980 * bread() reads a specified block and returns the buffer that contains
981 * it. It returns NULL if the block was unreadable.
983 struct buffer_head * bread(kdev_t dev, int block, int size)
985 struct buffer_head * bh;
987 bh = getblk(dev, block, size);
988 if (buffer_uptodate(bh))
989 return bh;
990 ll_rw_block(READ, 1, &bh);
991 wait_on_buffer(bh);
992 if (buffer_uptodate(bh))
993 return bh;
994 brelse(bh);
995 return NULL;
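/*
 * A typical caller pattern, sketched for illustration (not taken from
 * this file; "sb" is just an assumed superblock pointer):
 *
 *	struct buffer_head *bh = bread(dev, block, sb->s_blocksize);
 *	if (!bh)
 *		return -EIO;		(the block was unreadable)
 *	... examine bh->b_data ...
 *	brelse(bh);
 *
 * bread() returns with b_count raised, so each successful call must be
 * balanced by brelse() or bforget().
 */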
999 * Ok, breada can be used like bread, but additionally marks other
1000 * blocks for reading as well. End the argument list with a negative
1001 * number.
1004 #define NBUF 16
1006 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1007 unsigned int pos, unsigned int filesize)
1009 struct buffer_head * bhlist[NBUF];
1010 unsigned int blocks;
1011 struct buffer_head * bh;
1012 int index;
1013 int i, j;
1015 if (pos >= filesize)
1016 return NULL;
1018 if (block < 0)
1019 return NULL;
1021 bh = getblk(dev, block, bufsize);
1022 index = BUFSIZE_INDEX(bh->b_size);
1024 if (buffer_uptodate(bh))
1025 return(bh);
1026 else ll_rw_block(READ, 1, &bh);
1028 blocks = (filesize - pos) >> (9+index);
1030 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1031 blocks = read_ahead[MAJOR(dev)] >> index;
1032 if (blocks > NBUF)
1033 blocks = NBUF;
1035 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1037 bhlist[0] = bh;
1038 j = 1;
1039 for(i=1; i<blocks; i++) {
1040 bh = getblk(dev,block+i,bufsize);
1041 if (buffer_uptodate(bh)) {
1042 brelse(bh);
1043 break;
1045 else bhlist[j++] = bh;
1048 /* Request the read for these buffers, and then release them. */
1049 if (j>1)
1050 ll_rw_block(READA, (j-1), bhlist+1);
1051 for(i=1; i<j; i++)
1052 brelse(bhlist[i]);
1054 /* Wait for this buffer, and then continue on. */
1055 bh = bhlist[0];
1056 wait_on_buffer(bh);
1057 if (buffer_uptodate(bh))
1058 return bh;
1059 brelse(bh);
1060 return NULL;
1064 * Note: the caller should wake up the buffer_wait list if needed.
1066 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1068 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1069 kmem_cache_free(bh_cachep, bh);
1070 } else {
1071 bh->b_blocknr = -1;
1072 init_waitqueue_head(&bh->b_wait);
1073 nr_unused_buffer_heads++;
1074 bh->b_next_free = unused_list;
1075 bh->b_this_page = NULL;
1076 unused_list = bh;
1081 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1082 * no-buffer-head deadlock. Return NULL on failure; waiting for
1083 * buffer heads is now handled in create_buffers().
1085 static struct buffer_head * get_unused_buffer_head(int async)
1087 struct buffer_head * bh;
1089 spin_lock(&unused_list_lock);
1090 if (nr_unused_buffer_heads > NR_RESERVED) {
1091 bh = unused_list;
1092 unused_list = bh->b_next_free;
1093 nr_unused_buffer_heads--;
1094 spin_unlock(&unused_list_lock);
1095 return bh;
1097 spin_unlock(&unused_list_lock);
1099 /* This is critical. We can't swap out pages to get
1100 * more buffer heads, because the swap-out may need
1101 * more buffer-heads itself. Thus SLAB_BUFFER.
1103 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1104 memset(bh, 0, sizeof(*bh));
1105 init_waitqueue_head(&bh->b_wait);
1106 return bh;
1110 * If we need an async buffer, use the reserved buffer heads.
1112 if (async) {
1113 spin_lock(&unused_list_lock);
1114 if (unused_list) {
1115 bh = unused_list;
1116 unused_list = bh->b_next_free;
1117 nr_unused_buffer_heads--;
1118 spin_unlock(&unused_list_lock);
1119 return bh;
1121 spin_unlock(&unused_list_lock);
1123 #if 0
1125 * (Pending further analysis ...)
1126 * Ordinary (non-async) requests can use a different memory priority
1127 * to free up pages. Any swapping thus generated will use async
1128 * buffer heads.
1130 if(!async &&
1131 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1132 memset(bh, 0, sizeof(*bh));
1133 init_waitqueue_head(&bh->b_wait);
1134 return bh;
1136 #endif
1138 return NULL;
1141 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1143 bh->b_page = page;
1144 if (offset >= PAGE_SIZE)
1145 BUG();
1146 if (PageHighMem(page))
1148 * This catches illegal uses and preserves the offset:
1150 bh->b_data = (char *)(0 + offset);
1151 else
1152 bh->b_data = page_address(page) + offset;
1156 * Create the appropriate buffers when given a page for data area and
1157 * the size of each buffer.. Use the bh->b_this_page linked list to
1158 * follow the buffers created. Return NULL if unable to create more
1159 * buffers.
1160 * The async flag is used to differentiate async IO (paging, swapping)
1161 * from ordinary buffer allocations, and only async requests are allowed
1162 * to sleep waiting for buffer heads.
1164 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1166 struct buffer_head *bh, *head;
1167 long offset;
1169 try_again:
1170 head = NULL;
1171 offset = PAGE_SIZE;
1172 while ((offset -= size) >= 0) {
1173 bh = get_unused_buffer_head(async);
1174 if (!bh)
1175 goto no_grow;
1177 bh->b_dev = B_FREE; /* Flag as unused */
1178 bh->b_this_page = head;
1179 head = bh;
1181 bh->b_state = 0;
1182 bh->b_next_free = NULL;
1183 bh->b_pprev = NULL;
1184 atomic_set(&bh->b_count, 0);
1185 bh->b_size = size;
1187 set_bh_page(bh, page, offset);
1189 bh->b_list = BUF_CLEAN;
1190 bh->b_end_io = end_buffer_io_bad;
1192 return head;
1194 * In case anything failed, we just free everything we got.
1196 no_grow:
1197 if (head) {
1198 spin_lock(&unused_list_lock);
1199 do {
1200 bh = head;
1201 head = head->b_this_page;
1202 __put_unused_buffer_head(bh);
1203 } while (head);
1204 spin_unlock(&unused_list_lock);
1206 /* Wake up any waiters ... */
1207 wake_up(&buffer_wait);
1211 * Return failure for non-async IO requests. Async IO requests
1212 * are not allowed to fail, so we have to wait until buffer heads
1213 * become available. But we don't want tasks sleeping with
1214 * partially complete buffers, so all were released above.
1216 if (!async)
1217 return NULL;
1219 /* We're _really_ low on memory. Now we just
1220 * wait for old buffer heads to become free due to
1221 * finishing IO. Since this is an async request and
1222 * the reserve list is empty, we're sure there are
1223 * async buffer heads in use.
1225 run_task_queue(&tq_disk);
1228 * Set our state for sleeping, then check again for buffer heads.
1229 * This ensures we won't miss a wake_up from an interrupt.
1231 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1232 goto try_again;
1235 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1237 struct buffer_head *head, *bh, *tail;
1238 int block;
1240 if (!PageLocked(page))
1241 BUG();
1243 * Allocate async buffer heads pointing to this page, just for I/O.
1244 * They don't show up in the buffer hash table, but they *are*
1245 * registered in page->buffers.
1247 head = create_buffers(page, size, 1);
1248 if (page->buffers)
1249 BUG();
1250 if (!head)
1251 BUG();
1252 tail = head;
1253 for (bh = head; bh; bh = bh->b_this_page) {
1254 block = *(b++);
1256 tail = bh;
1257 init_buffer(bh, end_buffer_io_async, NULL);
1258 bh->b_dev = dev;
1259 bh->b_blocknr = block;
1261 set_bit(BH_Mapped, &bh->b_state);
1263 tail->b_this_page = head;
1264 page_cache_get(page);
1265 page->buffers = head;
1266 return 0;
1269 static void unmap_buffer(struct buffer_head * bh)
1271 if (buffer_mapped(bh)) {
1272 mark_buffer_clean(bh);
1273 wait_on_buffer(bh);
1274 clear_bit(BH_Uptodate, &bh->b_state);
1275 clear_bit(BH_Mapped, &bh->b_state);
1276 clear_bit(BH_Req, &bh->b_state);
1277 clear_bit(BH_New, &bh->b_state);
1282 * We don't have to release all buffers here, but
1283 * we have to be sure that no dirty buffer is left
1284 * and no IO is going on (no buffer is locked), because
1285 * we have truncated the file and are going to free the
1286 * blocks on-disk..
1288 int block_flushpage(struct page *page, unsigned long offset)
1290 struct buffer_head *head, *bh, *next;
1291 unsigned int curr_off = 0;
1293 if (!PageLocked(page))
1294 BUG();
1295 if (!page->buffers)
1296 return 1;
1298 head = page->buffers;
1299 bh = head;
1300 do {
1301 unsigned int next_off = curr_off + bh->b_size;
1302 next = bh->b_this_page;
1305 * is this block fully flushed?
1307 if (offset <= curr_off)
1308 unmap_buffer(bh);
1309 curr_off = next_off;
1310 bh = next;
1311 } while (bh != head);
1314 * subtle. We release buffer-heads only if this is
1315 * the 'final' flushpage. We have invalidated the get_block
1316 * cached value unconditionally, so real IO is not
1317 * possible anymore.
1319 * If the free doesn't work out, the buffers can be
1320 * left around - they just turn into anonymous buffers
1321 * instead.
1323 if (!offset) {
1324 if (!try_to_free_buffers(page, 0)) {
1325 atomic_inc(&buffermem_pages);
1326 return 0;
1330 return 1;
1333 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1335 struct buffer_head *bh, *head, *tail;
1337 head = create_buffers(page, blocksize, 1);
1338 if (page->buffers)
1339 BUG();
1341 bh = head;
1342 do {
1343 bh->b_dev = inode->i_dev;
1344 bh->b_blocknr = 0;
1345 bh->b_end_io = end_buffer_io_bad;
1346 tail = bh;
1347 bh = bh->b_this_page;
1348 } while (bh);
1349 tail->b_this_page = head;
1350 page->buffers = head;
1351 page_cache_get(page);
1355 * We are taking a block for data and we don't want any output from any
1356 * buffer-cache aliases starting from return from that function and
1357 * until the moment when something explicitly marks the buffer
1358 * dirty (hopefully that will not happen until we free that block ;-)
1359 * We don't even need to mark it not-uptodate - nobody can expect
1360 * anything from a newly allocated buffer anyway. We used to use
1361 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1362 * don't want to mark the alias unmapped, for example - it would confuse
1363 * anyone who might pick it with bread() afterwards...
1366 static void unmap_underlying_metadata(struct buffer_head * bh)
1368 struct buffer_head *old_bh;
1370 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1371 if (old_bh) {
1372 mark_buffer_clean(old_bh);
1373 wait_on_buffer(old_bh);
1374 clear_bit(BH_Req, &old_bh->b_state);
1375 /* Here we could run brelse or bforget. We use
1376 bforget because it will try to put the buffer
1377 in the freelist. */
1378 __bforget(old_bh);
1383 * block_write_full_page() is SMP-safe - currently it's still
1384 * being called with the kernel lock held, but the code is ready.
1386 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1388 int err, i, need_balance_dirty = 0;
1389 unsigned long block;
1390 struct buffer_head *bh, *head;
1392 if (!PageLocked(page))
1393 BUG();
1395 if (!page->buffers)
1396 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1397 head = page->buffers;
1399 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1401 bh = head;
1402 i = 0;
1403 do {
1405 * If the buffer isn't up-to-date, we can't be sure
1406 * that the buffer has been initialized with the proper
1407 * block number information etc..
1409 * Leave it to the low-level FS to make all those
1410 * decisions (block #0 may actually be a valid block)
1412 bh->b_end_io = end_buffer_io_sync;
1413 if (!buffer_mapped(bh)) {
1414 err = get_block(inode, block, bh, 1);
1415 if (err)
1416 goto out;
1417 if (buffer_new(bh))
1418 unmap_underlying_metadata(bh);
1420 set_bit(BH_Uptodate, &bh->b_state);
1421 if (!atomic_set_buffer_dirty(bh)) {
1422 __mark_dirty(bh, 0);
1423 need_balance_dirty = 1;
1426 bh = bh->b_this_page;
1427 block++;
1428 } while (bh != head);
1430 if (need_balance_dirty)
1431 balance_dirty(bh->b_dev);
1433 SetPageUptodate(page);
1434 return 0;
1435 out:
1436 ClearPageUptodate(page);
1437 return err;
1440 static int __block_prepare_write(struct inode *inode, struct page *page,
1441 unsigned from, unsigned to, get_block_t *get_block)
1443 unsigned block_start, block_end;
1444 unsigned long block;
1445 int err = 0;
1446 unsigned blocksize, bbits;
1447 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1448 char *kaddr = (char *)kmap(page);
1450 blocksize = inode->i_sb->s_blocksize;
1451 if (!page->buffers)
1452 create_empty_buffers(page, inode, blocksize);
1453 head = page->buffers;
1455 bbits = inode->i_sb->s_blocksize_bits;
1456 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1458 for(bh = head, block_start = 0; bh != head || !block_start;
1459 block++, block_start=block_end, bh = bh->b_this_page) {
1460 if (!bh)
1461 BUG();
1462 block_end = block_start+blocksize;
1463 if (block_end <= from)
1464 continue;
1465 if (block_start >= to)
1466 break;
1467 bh->b_end_io = end_buffer_io_sync;
1468 if (!buffer_mapped(bh)) {
1469 err = get_block(inode, block, bh, 1);
1470 if (err)
1471 goto out;
1472 if (buffer_new(bh)) {
1473 unmap_underlying_metadata(bh);
1474 if (block_end > to)
1475 memset(kaddr+to, 0, block_end-to);
1476 if (block_start < from)
1477 memset(kaddr+block_start, 0, from-block_start);
1478 if (block_end > to || block_start < from)
1479 flush_dcache_page(page);
1480 continue;
1483 if (!buffer_uptodate(bh) &&
1484 (block_start < from || block_end > to)) {
1485 ll_rw_block(READ, 1, &bh);
1486 *wait_bh++=bh;
1490 * If we issued read requests - let them complete.
1492 while(wait_bh > wait) {
1493 wait_on_buffer(*--wait_bh);
1494 err = -EIO;
1495 if (!buffer_uptodate(*wait_bh))
1496 goto out;
1498 return 0;
1499 out:
1500 return err;
1503 static int __block_commit_write(struct inode *inode, struct page *page,
1504 unsigned from, unsigned to)
1506 unsigned block_start, block_end;
1507 int partial = 0, need_balance_dirty = 0;
1508 unsigned blocksize;
1509 struct buffer_head *bh, *head;
1511 blocksize = inode->i_sb->s_blocksize;
1513 for(bh = head = page->buffers, block_start = 0;
1514 bh != head || !block_start;
1515 block_start=block_end, bh = bh->b_this_page) {
1516 block_end = block_start + blocksize;
1517 if (block_end <= from || block_start >= to) {
1518 if (!buffer_uptodate(bh))
1519 partial = 1;
1520 } else {
1521 set_bit(BH_Uptodate, &bh->b_state);
1522 if (!atomic_set_buffer_dirty(bh)) {
1523 __mark_dirty(bh, 0);
1524 need_balance_dirty = 1;
1529 if (need_balance_dirty)
1530 balance_dirty(bh->b_dev);
1532 * If this is a partial write that happened to make all buffers
1533 * uptodate then we can optimize away a bogus readpage() for
1534 * the next read(). Here we 'discover' whether the page went
1535 * uptodate as a result of this (potentially partial) write.
1537 if (!partial)
1538 SetPageUptodate(page);
1539 return 0;
1543 * Generic "read page" function for block devices that have the normal
1544 * get_block functionality. This is most of the block device filesystems.
1545 * Reads the page asynchronously --- the unlock_buffer() and
1546 * mark_buffer_uptodate() functions propagate buffer state into the
1547 * page struct once IO has completed.
1549 int block_read_full_page(struct page *page, get_block_t *get_block)
1551 struct inode *inode = (struct inode*)page->mapping->host;
1552 unsigned long iblock, lblock;
1553 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1554 unsigned int blocksize, blocks;
1555 unsigned long kaddr = 0;
1556 int nr, i;
1558 if (!PageLocked(page))
1559 PAGE_BUG(page);
1560 blocksize = inode->i_sb->s_blocksize;
1561 if (!page->buffers)
1562 create_empty_buffers(page, inode, blocksize);
1563 head = page->buffers;
1565 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1566 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1567 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1568 bh = head;
1569 nr = 0;
1570 i = 0;
1572 do {
1573 if (buffer_uptodate(bh))
1574 continue;
1576 if (!buffer_mapped(bh)) {
1577 if (iblock < lblock)
1578 get_block(inode, iblock, bh, 0);
1579 if (!buffer_mapped(bh)) {
1580 if (!kaddr)
1581 kaddr = kmap(page);
1582 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1583 flush_dcache_page(page);
1584 set_bit(BH_Uptodate, &bh->b_state);
1585 continue;
1589 init_buffer(bh, end_buffer_io_async, NULL);
1590 atomic_inc(&bh->b_count);
1591 arr[nr] = bh;
1592 nr++;
1593 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1595 if (nr) {
1596 if (Page_Uptodate(page))
1597 BUG();
1598 ll_rw_block(READ, nr, arr);
1599 } else {
1601 * all buffers are uptodate - we can set the page
1602 * uptodate as well.
1604 SetPageUptodate(page);
1605 UnlockPage(page);
1607 if (kaddr)
1608 kunmap(page);
1609 return 0;
1613 * For moronic filesystems that do not allow holes in files.
1614 * We may have to extend the file.
1617 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1619 struct address_space *mapping = page->mapping;
1620 struct inode *inode = (struct inode*)mapping->host;
1621 struct page *new_page;
1622 unsigned long pgpos;
1623 long status;
1624 unsigned zerofrom;
1625 unsigned blocksize = inode->i_sb->s_blocksize;
1626 char *kaddr;
1628 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1629 status = -ENOMEM;
1630 new_page = grab_cache_page(mapping, pgpos);
1631 if (!new_page)
1632 goto out;
1633 /* we might sleep */
1634 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1635 UnlockPage(new_page);
1636 page_cache_release(new_page);
1637 continue;
1639 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1640 if (zerofrom & (blocksize-1)) {
1641 *bytes |= (blocksize-1);
1642 (*bytes)++;
1644 status = __block_prepare_write(inode, new_page, zerofrom,
1645 PAGE_CACHE_SIZE, get_block);
1646 if (status)
1647 goto out_unmap;
1648 kaddr = page_address(new_page);
1649 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1650 flush_dcache_page(new_page);
1651 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1652 kunmap(new_page);
1653 UnlockPage(new_page);
1654 page_cache_release(new_page);
1657 if (page->index < pgpos) {
1658 /* completely inside the area */
1659 zerofrom = offset;
1660 } else {
1661 /* page covers the boundary, find the boundary offset */
1662 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1664 /* if we will expand the thing last block will be filled */
1665 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1666 *bytes |= (blocksize-1);
1667 (*bytes)++;
1670 /* starting below the boundary? Nothing to zero out */
1671 if (offset <= zerofrom)
1672 zerofrom = offset;
1674 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1675 if (status)
1676 goto out1;
1677 kaddr = page_address(page);
1678 if (zerofrom < offset) {
1679 memset(kaddr+zerofrom, 0, offset-zerofrom);
1680 flush_dcache_page(page);
1681 __block_commit_write(inode, page, zerofrom, offset);
1683 return 0;
1684 out1:
1685 ClearPageUptodate(page);
1686 kunmap(page);
1687 return status;
1689 out_unmap:
1690 ClearPageUptodate(new_page);
1691 kunmap(new_page);
1692 UnlockPage(new_page);
1693 page_cache_release(new_page);
1694 out:
1695 return status;
1698 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1699 get_block_t *get_block)
1701 struct inode *inode = (struct inode*)page->mapping->host;
1702 int err = __block_prepare_write(inode, page, from, to, get_block);
1703 if (err) {
1704 ClearPageUptodate(page);
1705 kunmap(page);
1707 return err;
1710 int generic_commit_write(struct file *file, struct page *page,
1711 unsigned from, unsigned to)
1713 struct inode *inode = (struct inode*)page->mapping->host;
1714 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1715 __block_commit_write(inode,page,from,to);
1716 kunmap(page);
1717 if (pos > inode->i_size) {
1718 inode->i_size = pos;
1719 mark_inode_dirty(inode);
1721 return 0;
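/*
 * prepare_write/commit_write always travel in pairs: the caller locks
 * the page, has ->prepare_write() map (and read, where needed) the
 * buffers in the [from, to) range, copies the new data into the page,
 * and then calls ->commit_write(), which as above dirties the buffers
 * and may extend i_size.  A rough, illustrative sketch of such a caller
 * (not the actual generic_file_write() code; "aops" stands for
 * page->mapping->a_ops and error handling is omitted):
 *
 *	err = aops->prepare_write(file, page, offset, offset + bytes);
 *	if (!err) {
 *		memcpy(page_address(page) + offset, data, bytes);
 *		err = aops->commit_write(file, page, offset, offset + bytes);
 *	}
 *	UnlockPage(page);
 */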
1725 * If it would be '74 that would go into libc...
1727 int mem_is_zero(char *p, unsigned len)
1729 while (len--)
1730 if (*p++)
1731 return 0;
1732 return 1;
1735 int block_zero_page(struct address_space *mapping, loff_t from, unsigned length)
1737 unsigned long index = from >> PAGE_CACHE_SHIFT;
1738 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1739 struct inode *inode = (struct inode *)mapping->host;
1740 struct page *page;
1741 char *kaddr;
1742 int err;
1744 if (!length)
1745 return 0;
1747 page = read_cache_page(mapping, index,
1748 (filler_t *)mapping->a_ops->readpage, NULL);
1749 err = PTR_ERR(page);
1750 if (IS_ERR(page))
1751 goto out;
1752 lock_page(page);
1753 err = -EIO;
1754 if (!Page_Uptodate(page))
1755 goto unlock;
1756 kaddr = (char*)kmap(page);
1757 err = 0;
1758 if (mem_is_zero(kaddr+offset, length))
1759 goto unmap;
1760 memset(kaddr+offset, 0, length);
1761 flush_dcache_page(page);
1762 __block_commit_write(inode, page, offset, offset+length);
1763 unmap:
1764 kunmap(page);
1765 unlock:
1766 UnlockPage(page);
1767 page_cache_release(page);
1768 out:
1769 return err;
1772 int block_write_full_page(struct page *page, get_block_t *get_block)
1774 struct inode *inode = (struct inode*)page->mapping->host;
1775 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1776 unsigned offset;
1777 int err;
1779 /* easy case */
1780 if (page->index < end_index)
1781 return __block_write_full_page(inode, page, get_block);
1783 /* things got complicated... */
1784 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1785 /* OK, are we completely out? */
1786 if (page->index >= end_index+1 || !offset)
1787 return -EIO;
1788 /* Sigh... will have to work, then... */
1789 err = __block_prepare_write(inode, page, 0, offset, get_block);
1790 if (!err) {
1791 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
1792 flush_dcache_page(page);
1793 __block_commit_write(inode,page,0,offset);
1794 done:
1795 kunmap(page);
1796 return err;
1798 ClearPageUptodate(page);
1799 goto done;
1802 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1804 struct buffer_head tmp;
1805 struct inode *inode = (struct inode*)mapping->host;
1806 tmp.b_state = 0;
1807 tmp.b_blocknr = 0;
1808 get_block(inode, block, &tmp, 0);
1809 return tmp.b_blocknr;
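/*
 * generic_block_bmap() answers "which on-disk block backs logical block
 * N" by calling get_block() with create == 0 on a throwaway
 * buffer_head; a result of 0 conventionally means a hole.  This is what
 * the FIBMAP ioctl typically ends up using.
 */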
1813 * IO completion routine for a buffer_head being used for kiobuf IO: we
1814 * can't dispatch the kiobuf callback until io_count reaches 0.
1817 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1819 struct kiobuf *kiobuf;
1821 mark_buffer_uptodate(bh, uptodate);
1823 kiobuf = bh->b_private;
1824 unlock_buffer(bh);
1825 end_kio_request(kiobuf, uptodate);
1830 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1831 * for them to complete. Clean up the buffer_heads afterwards.
1834 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
1836 int iosize;
1837 int i;
1838 struct buffer_head *tmp;
1841 iosize = 0;
1842 spin_lock(&unused_list_lock);
1844 for (i = nr; --i >= 0; ) {
1845 iosize += size;
1846 tmp = bh[i];
1847 if (buffer_locked(tmp)) {
1848 spin_unlock(&unused_list_lock);
1849 wait_on_buffer(tmp);
1850 spin_lock(&unused_list_lock);
1853 if (!buffer_uptodate(tmp)) {
1854 /* We are traversing bh'es in reverse order so
1855 clearing iosize on error calculates the
1856 amount of IO before the first error. */
1857 iosize = 0;
1859 __put_unused_buffer_head(tmp);
1862 spin_unlock(&unused_list_lock);
1864 return iosize;
1868 * Start I/O on a physical range of kernel memory, defined by a vector
1869 * of kiobuf structs (much like a user-space iovec list).
1871 * The kiobuf must already be locked for IO. IO is submitted
1872 * asynchronously: you need to check page->locked, page->uptodate, and
1873 * maybe wait on page->wait.
1875 * It is up to the caller to make sure that there are enough blocks
1876 * passed in to completely map the iobufs to disk.
1879 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1880 kdev_t dev, unsigned long b[], int size)
1882 int err;
1883 int length;
1884 int transferred;
1885 int i;
1886 int bufind;
1887 int pageind;
1888 int bhind;
1889 int offset;
1890 int sectors = size>>9;
1891 unsigned long blocknr;
1892 struct kiobuf * iobuf = NULL;
1893 struct page * map;
1894 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1896 if (!nr)
1897 return 0;
1900 * First, do some alignment and validity checks
1902 for (i = 0; i < nr; i++) {
1903 iobuf = iovec[i];
1904 if ((iobuf->offset & (size-1)) ||
1905 (iobuf->length & (size-1)))
1906 return -EINVAL;
1907 if (!iobuf->nr_pages)
1908 panic("brw_kiovec: iobuf not initialised");
1912 * OK to walk down the iovec doing page IO on each page we find.
1914 bufind = bhind = transferred = err = 0;
1915 for (i = 0; i < nr; i++) {
1916 iobuf = iovec[i];
1917 offset = iobuf->offset;
1918 length = iobuf->length;
1919 iobuf->errno = 0;
1921 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1922 map = iobuf->maplist[pageind];
1923 if (!map) {
1924 err = -EFAULT;
1925 goto error;
1928 while (length > 0) {
1929 blocknr = b[bufind++];
1930 tmp = get_unused_buffer_head(0);
1931 if (!tmp) {
1932 err = -ENOMEM;
1933 goto error;
1936 tmp->b_dev = B_FREE;
1937 tmp->b_size = size;
1938 set_bh_page(tmp, map, offset);
1939 tmp->b_this_page = tmp;
1941 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
1942 tmp->b_rdev = tmp->b_dev = dev;
1943 tmp->b_blocknr = blocknr;
1944 tmp->b_rsector = blocknr*sectors;
1945 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
1947 if (rw == WRITE) {
1948 set_bit(BH_Uptodate, &tmp->b_state);
1949 set_bit(BH_Dirty, &tmp->b_state);
1952 bh[bhind++] = tmp;
1953 length -= size;
1954 offset += size;
1956 atomic_inc(&iobuf->io_count);
1958 generic_make_request(rw, tmp);
1960 * Wait for IO if we have got too much
1962 if (bhind >= KIO_MAX_SECTORS) {
1963 err = wait_kio(rw, bhind, bh, size);
1964 if (err >= 0)
1965 transferred += err;
1966 else
1967 goto finished;
1968 bhind = 0;
1971 if (offset >= PAGE_SIZE) {
1972 offset = 0;
1973 break;
1975 } /* End of block loop */
1976 } /* End of page loop */
1977 } /* End of iovec loop */
1979 /* Is there any IO still left to submit? */
1980 if (bhind) {
1981 err = wait_kio(rw, bhind, bh, size);
1982 if (err >= 0)
1983 transferred += err;
1984 else
1985 goto finished;
1988 finished:
1989 if (transferred)
1990 return transferred;
1991 return err;
1993 error:
1994 /* We got an error allocating the bh'es. Just free the current
1995 buffer_heads and exit. */
1996 spin_lock(&unused_list_lock);
1997 for (i = bhind; --i >= 0; ) {
1998 __put_unused_buffer_head(bh[i]);
2000 spin_unlock(&unused_list_lock);
2001 goto finished;
2005 * Start I/O on a page.
2006 * This function expects the page to be locked and may return
2007 * before I/O is complete. You then have to check page->locked,
2008 * page->uptodate, and maybe wait on page->wait.
2010 * brw_page() is SMP-safe, although it's being called with the
2011 * kernel lock held - but the code is ready.
2013 * FIXME: we need a swapper_inode->get_block function to remove
2014 * some of the bmap kludges and interface ugliness here.
2016 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2018 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
2019 int nr, fresh /* temporary debugging flag */, block;
2021 if (!PageLocked(page))
2022 panic("brw_page: page not locked for I/O");
2023 // ClearPageError(page);
2025 * We pretty much rely on the page lock for this, because
2026 * create_page_buffers() might sleep.
2028 fresh = 0;
2029 if (!page->buffers) {
2030 create_page_buffers(rw, page, dev, b, size);
2031 fresh = 1;
2033 if (!page->buffers)
2034 BUG();
2036 head = page->buffers;
2037 bh = head;
2038 nr = 0;
2039 do {
2040 block = *(b++);
2042 if (fresh && (atomic_read(&bh->b_count) != 0))
2043 BUG();
2044 if (rw == READ) {
2045 if (!fresh)
2046 BUG();
2047 if (!buffer_uptodate(bh)) {
2048 arr[nr++] = bh;
2049 atomic_inc(&bh->b_count);
2051 } else { /* WRITE */
2052 if (!bh->b_blocknr) {
2053 if (!block)
2054 BUG();
2055 bh->b_blocknr = block;
2056 } else {
2057 if (!block)
2058 BUG();
2060 set_bit(BH_Uptodate, &bh->b_state);
2061 set_bit(BH_Dirty, &bh->b_state);
2062 arr[nr++] = bh;
2063 atomic_inc(&bh->b_count);
2065 bh = bh->b_this_page;
2066 } while (bh != head);
2067 if ((rw == READ) && nr) {
2068 if (Page_Uptodate(page))
2069 BUG();
2070 ll_rw_block(rw, nr, arr);
2071 } else {
2072 if (!nr && rw == READ) {
2073 SetPageUptodate(page);
2074 UnlockPage(page);
2076 if (nr && (rw == WRITE))
2077 ll_rw_block(rw, nr, arr);
2079 return 0;
2082 int block_symlink(struct inode *inode, const char *symname, int len)
2084 struct address_space *mapping = inode->i_mapping;
2085 struct page *page = grab_cache_page(mapping, 0);
2086 int err = -ENOMEM;
2087 char *kaddr;
2089 if (!page)
2090 goto fail;
2091 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2092 if (err)
2093 goto fail_map;
2094 kaddr = page_address(page);
2095 memcpy(kaddr, symname, len-1);
2096 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2098 * Notice that we are _not_ going to block here - end of page is
2099 * unmapped, so this will only try to map the rest of page, see
2100 * that it is unmapped (typically even will not look into inode -
2101 * ->i_size will be enough for everything) and zero it out.
2102 * OTOH it's obviously correct and should make the page up-to-date.
2104 err = mapping->a_ops->readpage(NULL, page);
2105 wait_on_page(page);
2106 page_cache_release(page);
2107 if (err < 0)
2108 goto fail;
2109 mark_inode_dirty(inode);
2110 return 0;
2111 fail_map:
2112 UnlockPage(page);
2113 page_cache_release(page);
2114 fail:
2115 return err;
2119 * Try to increase the number of buffers available: the size argument
2120 * is used to determine what kind of buffers we want.
2122 static int grow_buffers(int size)
2124 struct page * page;
2125 struct buffer_head *bh, *tmp;
2126 struct buffer_head * insert_point;
2127 int isize;
2129 if ((size & 511) || (size > PAGE_SIZE)) {
2130 printk("VFS: grow_buffers: size = %d\n",size);
2131 return 0;
2134 page = alloc_page(GFP_BUFFER);
2135 if (!page)
2136 goto out;
2137 bh = create_buffers(page, size, 0);
2138 if (!bh)
2139 goto no_buffer_head;
2141 isize = BUFSIZE_INDEX(size);
2143 spin_lock(&free_list[isize].lock);
2144 insert_point = free_list[isize].list;
2145 tmp = bh;
2146 while (1) {
2147 if (insert_point) {
2148 tmp->b_next_free = insert_point->b_next_free;
2149 tmp->b_prev_free = insert_point;
2150 insert_point->b_next_free->b_prev_free = tmp;
2151 insert_point->b_next_free = tmp;
2152 } else {
2153 tmp->b_prev_free = tmp;
2154 tmp->b_next_free = tmp;
2156 insert_point = tmp;
2157 if (tmp->b_this_page)
2158 tmp = tmp->b_this_page;
2159 else
2160 break;
2162 tmp->b_this_page = bh;
2163 free_list[isize].list = bh;
2164 spin_unlock(&free_list[isize].lock);
2166 page->buffers = bh;
2167 page->flags &= ~(1 << PG_referenced);
2168 lru_cache_add(page);
2169 atomic_inc(&buffermem_pages);
2170 return 1;
2172 no_buffer_head:
2173 page_cache_release(page);
2174 out:
2175 return 0;
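/*
 * Sketch of a free-list refill loop built on grow_buffers() (illustration
 * only): "refill_free_list" is a hypothetical stand-in for the real
 * consumers of the free lists earlier in this file.
 */
#if 0
static void refill_free_list(int size)
{
	int isize = BUFSIZE_INDEX(size);

	for (;;) {
		spin_lock(&free_list[isize].lock);
		if (free_list[isize].list) {
			/* Something is available to hand out. */
			spin_unlock(&free_list[isize].lock);
			return;
		}
		spin_unlock(&free_list[isize].lock);

		/*
		 * Nothing free in this size: try to add a page worth of
		 * buffers.  grow_buffers() returns 0 on allocation failure,
		 * in which case we ask bdflush to push dirty data out and
		 * wait before trying again.
		 */
		if (!grow_buffers(size))
			wakeup_bdflush(1);
	}
}
#endif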
2179 * Sync all the buffers on one page..
2181 * If we have old buffers that are locked, we'll
2182 * wait on them, but we won't wait on the new ones
2183 * we're writing out now.
2185 * All of this is required so that we can free up memory
2186 * later.
2188 * Wait:
2189 * 0 - no wait (this does not get called - see try_to_free_buffers below)
2190 * 1 - start IO for dirty buffers
2191 * 2 - wait for completion of locked buffers
2193 static void sync_page_buffers(struct buffer_head *bh, int wait)
2195 struct buffer_head * tmp = bh;
2197 do {
2198 struct buffer_head *p = tmp;
2199 tmp = tmp->b_this_page;
2200 if (buffer_locked(p)) {
2201 if (wait > 1)
2202 __wait_on_buffer(p);
2203 } else if (buffer_dirty(p))
2204 ll_rw_block(WRITE, 1, &p);
2205 } while (tmp != bh);
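/*
 * Concretely: with wait == 1 the loop above only queues WRITEs for the
 * dirty, unlocked buffers on the page; with wait == 2 it additionally
 * blocks in __wait_on_buffer() for buffers already under I/O, so a later
 * try_to_free_buffers() call on the same page has a much better chance
 * of succeeding.
 */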
2209 * Can the buffer be thrown out?
2211 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2212 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2215 * try_to_free_buffers() checks if all the buffers on this particular page
2216 * are unused, and frees the page if so.
2218 * Wake up bdflush() if this fails - if we're running low on memory due
2219 * to dirty buffers, we need to flush them out as quickly as possible.
2221 * NOTE: There are quite a number of ways that threads of control can
2222 * obtain a reference to a buffer head within a page. So we must
2223 * lock out all of these paths to cleanly toss the page.
2225 int try_to_free_buffers(struct page * page, int wait)
2227 struct buffer_head * tmp, * bh = page->buffers;
2228 int index = BUFSIZE_INDEX(bh->b_size);
2230 spin_lock(&lru_list_lock);
2231 write_lock(&hash_table_lock);
2232 spin_lock(&free_list[index].lock);
2233 tmp = bh;
2234 do {
2235 struct buffer_head *p = tmp;
2237 tmp = tmp->b_this_page;
2238 if (buffer_busy(p))
2239 goto busy_buffer_page;
2240 } while (tmp != bh);
2242 spin_lock(&unused_list_lock);
2243 tmp = bh;
2244 do {
2245 struct buffer_head * p = tmp;
2246 tmp = tmp->b_this_page;
2248 /* The buffer can be either on the regular
2249 * queues or on the free list..
2251 if (p->b_dev != B_FREE)
2252 __remove_from_queues(p);
2253 else
2254 __remove_from_free_list(p, index);
2255 __put_unused_buffer_head(p);
2256 } while (tmp != bh);
2257 spin_unlock(&unused_list_lock);
2259 /* Wake up anyone waiting for buffer heads */
2260 wake_up(&buffer_wait);
2262 /* And free the page */
2263 page->buffers = NULL;
2264 page_cache_release(page);
2265 spin_unlock(&free_list[index].lock);
2266 write_unlock(&hash_table_lock);
2267 spin_unlock(&lru_list_lock);
2268 return 1;
2270 busy_buffer_page:
2271 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2272 spin_unlock(&free_list[index].lock);
2273 write_unlock(&hash_table_lock);
2274 spin_unlock(&lru_list_lock);
2275 if (wait)
2276 sync_page_buffers(bh, wait);
2277 return 0;
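/*
 * Sketch of a reclaim-style caller (illustration only): "shrink_one_page"
 * is a hypothetical name; what matters is the protocol it shows - the page
 * must already be locked by the caller, and a non-zero wait lets a failed
 * attempt start writeback so that a later attempt can succeed.
 */
#if 0
static int shrink_one_page(struct page *page, int can_wait)
{
	if (!PageLocked(page))
		BUG();			/* caller must hold the page lock */

	if (!page->buffers)
		return 0;		/* no buffer heads on this page */

	return try_to_free_buffers(page, can_wait ? 2 : 0);
}
#endif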
2280 /* ================== Debugging =================== */
2282 void show_buffers(void)
2284 #ifdef CONFIG_SMP
2285 struct buffer_head * bh;
2286 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2287 int protected = 0;
2288 int nlist;
2289 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2290 #endif
2292 printk("Buffer memory: %6dkB\n",
2293 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2295 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2296 if (!spin_trylock(&lru_list_lock))
2297 return;
2298 for(nlist = 0; nlist < NR_LIST; nlist++) {
2299 found = locked = dirty = used = lastused = protected = 0;
2300 bh = lru_list[nlist];
2301 if(!bh) continue;
2303 do {
2304 found++;
2305 if (buffer_locked(bh))
2306 locked++;
2307 if (buffer_protected(bh))
2308 protected++;
2309 if (buffer_dirty(bh))
2310 dirty++;
2311 if (atomic_read(&bh->b_count))
2312 used++, lastused = found;
2313 bh = bh->b_next_free;
2314 } while (bh != lru_list[nlist]);
2316 int tmp = nr_buffers_type[nlist];
2317 if (found != tmp)
2318 printk("%9s: BUG -> found %d, reported %d\n",
2319 buf_types[nlist], found, tmp);
2321 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2322 "%d locked, %d protected, %d dirty\n",
2323 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2324 used, lastused, locked, protected, dirty);
2326 spin_unlock(&lru_list_lock);
2327 #endif
2330 /* ===================== Init ======================= */
2333 * allocate the hash table and init the free list
2334 * Use gfp() for the hash table to decrease TLB misses, use
2335 * SLAB cache for buffer heads.
2337 void __init buffer_init(unsigned long mempages)
2339 int order, i;
2340 unsigned int nr_hash;
2342 /* The buffer cache hash table is less important these days,
2343 * trim it a bit.
2345 mempages >>= 14;
2347 mempages *= sizeof(struct buffer_head *);
2349 for (order = 0; (1 << order) < mempages; order++)
2352 /* keep trying smaller allocations until one succeeds or the
2353 request gets too small to be worth trying */
2355 do {
2356 unsigned long tmp;
2358 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2359 bh_hash_mask = (nr_hash - 1);
2361 tmp = nr_hash;
2362 bh_hash_shift = 0;
2363 while((tmp >>= 1UL) != 0UL)
2364 bh_hash_shift++;
2366 hash_table = (struct buffer_head **)
2367 __get_free_pages(GFP_ATOMIC, order);
2368 } while (hash_table == NULL && --order > 0);
2369 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2370 nr_hash, order, (PAGE_SIZE << order));
2372 if (!hash_table)
2373 panic("Failed to allocate buffer hash table\n");
2375 /* Setup hash chains. */
2376 for(i = 0; i < nr_hash; i++)
2377 hash_table[i] = NULL;
2379 /* Setup free lists. */
2380 for(i = 0; i < NR_SIZES; i++) {
2381 free_list[i].list = NULL;
2382 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2385 /* Setup lru lists. */
2386 for(i = 0; i < NR_LIST; i++)
2387 lru_list[i] = NULL;
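/*
 * Worked example of the sizing above, assuming 4 kB pages and 32-bit
 * pointers: a 64 MB machine has mempages = 16384, so 16384 >> 14 = 1 and
 * 1 * sizeof(struct buffer_head *) = 4, giving order = 2.  The first pass
 * of the allocation loop then asks for a single 16 kB __get_free_pages()
 * area holding nr_hash = (4096 << 2) / 4 = 4096 hash buckets, with
 * bh_hash_mask = 4095 and bh_hash_shift = 12.
 */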
2392 /* ====================== bdflush support =================== */
2394 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2395 * response to dirty buffers. Once this process is activated, we write back
2396 * a limited number of buffers to the disks and then go back to sleep again.
2398 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2399 struct task_struct *bdflush_tsk = 0;
2401 void wakeup_bdflush(int block)
2403 DECLARE_WAITQUEUE(wait, current);
2405 if (current == bdflush_tsk)
2406 return;
2408 if (!block) {
2409 wake_up_process(bdflush_tsk);
2410 return;
2413 /* kflushd can wake us up before we have a chance to
2414 go to sleep, so we must be careful in handling
2415 this wakeup event from kflushd to avoid deadlocking on SMP
2416 (we are not holding any locks anymore in these two paths). */
2417 __set_current_state(TASK_UNINTERRUPTIBLE);
2418 add_wait_queue(&bdflush_done, &wait);
2420 wake_up_process(bdflush_tsk);
2421 schedule();
2423 remove_wait_queue(&bdflush_done, &wait);
2424 __set_current_state(TASK_RUNNING);
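/*
 * In the blocking case the handshake is: queue ourselves on bdflush_done,
 * wake kflushd, and sleep; bdflush() below does one flush_dirty_buffers()
 * pass and its wake_up(&bdflush_done) releases us.  The non-blocking case
 * is just wake_up_process() and return, which is what callers that merely
 * want writeback started should use.
 */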
2427 /* This is the _only_ function that deals with flushing async writes
2428 to disk.
2429 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2430 as all dirty buffers live _only_ in the DIRTY lru list.
2431 As we never browse the LOCKED and CLEAN lru lists they are in fact
2432 completely useless. */
2433 static int flush_dirty_buffers(int check_flushtime)
2435 struct buffer_head * bh, *next;
2436 int flushed = 0, i;
2438 restart:
2439 spin_lock(&lru_list_lock);
2440 bh = lru_list[BUF_DIRTY];
2441 if (!bh)
2442 goto out_unlock;
2443 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2444 next = bh->b_next_free;
2446 if (!buffer_dirty(bh)) {
2447 __refile_buffer(bh);
2448 continue;
2450 if (buffer_locked(bh))
2451 continue;
2453 if (check_flushtime) {
2454 /* The dirty lru list is chronologically ordered, so
2455 if the current bh has not yet timed out,
2456 all the following bhs
2457 will be too young as well. */
2458 if (time_before(jiffies, bh->b_flushtime))
2459 goto out_unlock;
2460 } else {
2461 if (++flushed > bdf_prm.b_un.ndirty)
2462 goto out_unlock;
2465 /* OK, now we are committed to write it out. */
2466 atomic_inc(&bh->b_count);
2467 spin_unlock(&lru_list_lock);
2468 ll_rw_block(WRITE, 1, &bh);
2469 atomic_dec(&bh->b_count);
2471 if (current->need_resched)
2472 schedule();
2473 goto restart;
2475 out_unlock:
2476 spin_unlock(&lru_list_lock);
2478 return flushed;
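/*
 * The two modes correspond to the two daemons below: bdflush() calls
 * flush_dirty_buffers(0) and writes at most bdf_prm.b_un.ndirty buffers
 * per pass, while sync_old_buffers() (run from kupdate()) calls
 * flush_dirty_buffers(1) and writes every buffer whose b_flushtime has
 * already expired, however many that is.
 */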
2482 * Here we attempt to write back old buffers. We also try to flush inodes
2483 * and supers, since this function is essentially "update", and
2484 * otherwise there would be no way of ensuring that these quantities ever
2485 * get written back. Ideally, we would have a timestamp on the inodes
2486 * and superblocks so that we could write back only the old ones as well
2489 static int sync_old_buffers(void)
2491 lock_kernel();
2492 sync_supers(0);
2493 sync_inodes(0);
2494 unlock_kernel();
2496 flush_dirty_buffers(1);
2497 /* must really sync all the active I/O request to disk here */
2498 run_task_queue(&tq_disk);
2499 return 0;
2502 int block_sync_page(struct page *page)
2504 run_task_queue(&tq_disk);
2505 return 0;
2508 /* This is the interface to bdflush. As we get more sophisticated, we can
2509 * pass tuning parameters to this "process", to adjust how it behaves.
2510 * We would want to verify each parameter, however, to make sure that it
2511 * is reasonable. */
2513 asmlinkage long sys_bdflush(int func, long data)
2515 if (!capable(CAP_SYS_ADMIN))
2516 return -EPERM;
2518 if (func == 1) {
2519 /* do_exit directly and let kupdate do its work alone. */
2520 do_exit(0);
2521 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2522 a syscall that doesn't care about the current mm context. */
2523 int error;
2524 struct mm_struct *user_mm;
2527 * bdflush will spend all of its time in kernel-space,
2528 * without touching user-space, so we can switch it into
2529 * 'lazy TLB mode' to reduce the cost of context-switches
2530 * to and from bdflush.
2532 user_mm = start_lazy_tlb();
2533 error = sync_old_buffers();
2534 end_lazy_tlb(user_mm);
2535 return error;
2536 #endif
2539 /* Basically func 2 reads param 0, func 3 writes param 0, func 4 reads param 1, etc: i = (func-2)/2, even func reads, odd func writes */
2540 if (func >= 2) {
2541 int i = (func-2) >> 1;
2542 if (i >= 0 && i < N_PARAM) {
2543 if ((func & 1) == 0)
2544 return put_user(bdf_prm.data[i], (int*)data);
2546 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2547 bdf_prm.data[i] = data;
2548 return 0;
2551 return -EINVAL;
2554 /* Func 0 used to launch the actual bdflush and then never
2555 * return (unless explicitly killed). We return zero here to
2556 * remain semi-compatible with present update(8) programs.
2558 return 0;
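/*
 * Worked example of the func encoding: func = 2 reads bdf_prm.data[0]
 * (nfract) into the int pointed to by data, func = 3 sets it from data,
 * func = 4/5 read/write data[1] (ndirty), and so on for all N_PARAM
 * parameters; writes are range-checked against bdflush_min[] and
 * bdflush_max[].
 */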
2562 * This is the actual bdflush daemon itself. It used to be started from
2563 * the syscall above, but now we launch it ourselves internally with
2564 * kernel_thread(...) directly after the first thread in init/main.c
2566 int bdflush(void *sem)
2568 struct task_struct *tsk = current;
2569 int flushed;
2571 * We have a bare-bones task_struct, and really should fill
2572 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2573 * display semi-sane things. Not real crucial though...
2576 tsk->session = 1;
2577 tsk->pgrp = 1;
2578 strcpy(tsk->comm, "kflushd");
2579 bdflush_tsk = tsk;
2581 /* avoid getting signals */
2582 spin_lock_irq(&tsk->sigmask_lock);
2583 flush_signals(tsk);
2584 sigfillset(&tsk->blocked);
2585 recalc_sigpending(tsk);
2586 spin_unlock_irq(&tsk->sigmask_lock);
2588 up((struct semaphore *)sem);
2590 for (;;) {
2591 CHECK_EMERGENCY_SYNC
2593 flushed = flush_dirty_buffers(0);
2595 /* If wakeup_bdflush() wakes us up
2596 after our bdflush_done wakeup, then
2597 we must make sure not to sleep
2598 in the schedule() below, otherwise
2599 wakeup_bdflush() may wait for a
2600 bdflush_done wakeup that would never arrive
2601 (as we would be sleeping) and so it would
2602 deadlock on SMP. */
2603 __set_current_state(TASK_INTERRUPTIBLE);
2604 wake_up(&bdflush_done);
2606 * If there are still a lot of dirty buffers around,
2607 * skip the sleep and flush some more. Otherwise, we
2608 * go to sleep waiting for a wakeup.
2610 if (!flushed || balance_dirty_state(NODEV) < 0)
2611 schedule();
2612 /* Remember to mark us as running otherwise
2613 the next schedule will block. */
2614 __set_current_state(TASK_RUNNING);
2619 * This is the kernel update daemon. It used to live in userspace
2620 * but since it needs to run safely we don't want it killable by mistake.
2621 * You don't need to change your userspace configuration since
2622 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2624 int kupdate(void *sem)
2626 struct task_struct * tsk = current;
2627 int interval;
2629 tsk->session = 1;
2630 tsk->pgrp = 1;
2631 strcpy(tsk->comm, "kupdate");
2633 /* SIGSTOP and SIGCONT will stop and wake up kupdate */
2634 spin_lock_irq(&tsk->sigmask_lock);
2635 sigfillset(&tsk->blocked);
2636 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2637 recalc_sigpending(tsk);
2638 spin_unlock_irq(&tsk->sigmask_lock);
2640 up((struct semaphore *)sem);
2642 for (;;) {
2643 /* update interval */
2644 interval = bdf_prm.b_un.interval;
2645 if (interval) {
2646 tsk->state = TASK_INTERRUPTIBLE;
2647 schedule_timeout(interval);
2648 } else {
2649 stop_kupdate:
2650 tsk->state = TASK_STOPPED;
2651 schedule(); /* wait for SIGCONT */
2653 /* check for sigstop */
2654 if (signal_pending(tsk)) {
2655 int stopped = 0;
2656 spin_lock_irq(&tsk->sigmask_lock);
2657 if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2658 sigdelset(&tsk->pending.signal, SIGSTOP);
2659 stopped = 1;
2661 recalc_sigpending(tsk);
2662 spin_unlock_irq(&tsk->sigmask_lock);
2663 if (stopped)
2664 goto stop_kupdate;
2666 #ifdef DEBUG
2667 printk("kupdate() activated...\n");
2668 #endif
2669 sync_old_buffers();
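/*
 * In practice this means periodic flushing can be paused from userspace
 * with a SIGSTOP to the kupdate thread and resumed with SIGCONT; an
 * interval of 0 parks the thread in the same TASK_STOPPED state until a
 * SIGCONT arrives.
 */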
2673 static int __init bdflush_init(void)
2675 DECLARE_MUTEX_LOCKED(sem);
2676 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2677 down(&sem);
2678 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2679 down(&sem);
2680 return 0;
2683 module_init(bdflush_init)