fs/buffer.c (Linux 2.3.3, davej-history.git)
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 #include <linux/malloc.h>
28 #include <linux/locks.h>
29 #include <linux/errno.h>
30 #include <linux/swap.h>
31 #include <linux/swapctl.h>
32 #include <linux/smp_lock.h>
33 #include <linux/vmalloc.h>
34 #include <linux/blkdev.h>
35 #include <linux/sysrq.h>
36 #include <linux/file.h>
37 #include <linux/init.h>
38 #include <linux/quotaops.h>
40 #include <asm/uaccess.h>
41 #include <asm/io.h>
42 #include <asm/bitops.h>
44 #define NR_SIZES 7
45 static char buffersize_index[65] =
46 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
47 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
48 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
49 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
50 6};
52 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
53 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
54 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
55 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
56 number of unused buffer heads */
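/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096):
 *	BUFSIZE_INDEX(512)   == buffersize_index[1]  == 0
 *	BUFSIZE_INDEX(1024)  == buffersize_index[2]  == 1
 *	BUFSIZE_INDEX(4096)  == buffersize_index[8]  == 3
 *	BUFSIZE_INDEX(32768) == buffersize_index[64] == 6
 * and MAX_BUF_PER_PAGE == 8, NR_RESERVED == 16, MAX_UNUSED_BUFFERS == 36.
 */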
59 * Hash table mask..
61 static unsigned long bh_hash_mask = 0;
63 static int grow_buffers(int size);
65 static struct buffer_head ** hash_table;
66 static struct buffer_head * lru_list[NR_LIST] = {NULL, };
67 static struct buffer_head * free_list[NR_SIZES] = {NULL, };
69 static kmem_cache_t *bh_cachep;
71 static struct buffer_head * unused_list = NULL;
72 static struct buffer_head * reuse_list = NULL;
73 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
75 static int nr_buffers = 0;
76 static int nr_buffers_type[NR_LIST] = {0,};
77 static int nr_buffer_heads = 0;
78 static int nr_unused_buffer_heads = 0;
79 static int nr_hashed_buffers = 0;
81 /* This is used by some architectures to estimate available memory. */
82 int buffermem = 0;
84 /* Here is the parameter block for the bdflush process. If you add or
85 * remove any of the parameters, make sure to update kernel/sysctl.c.
88 #define N_PARAM 9
90 /* The dummy values in this structure are left in there for compatibility
91 * with old programs that play with the /proc entries.
93 union bdflush_param{
94 struct {
95 int nfract; /* Percentage of buffer cache dirty to
96 activate bdflush */
97 int ndirty; /* Maximum number of dirty blocks to write out per
98 wake-cycle */
99 int nrefill; /* Number of clean buffers to try to obtain
100 each time we call refill */
101 int nref_dirt; /* Dirty buffer threshold for activating bdflush
102 when trying to refill buffers. */
103 int dummy1; /* unused */
104 int age_buffer; /* Time for normal buffer to age before
105 we flush it */
106 int age_super; /* Time for superblock to age before we
107 flush it */
108 int dummy2; /* unused */
109 int dummy3; /* unused */
110 } b_un;
111 unsigned int data[N_PARAM];
112 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
114 /* These are the min and max parameter values that we will allow to be assigned */
115 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
116 int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 600*HZ, 600*HZ, 2047, 5};
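/*
 * Illustrative sketch (not in the original file): because bdf_prm is a
 * union, the named fields alias the data[] array that sys_bdflush() below
 * indexes, e.g. bdf_prm.data[0] is b_un.nfract (40) and bdf_prm.data[1] is
 * b_un.ndirty (500).  A hypothetical tuning helper, validating against
 * bdflush_min[]/bdflush_max[] the same way sys_bdflush() does, is kept
 * under #if 0 so it is illustration only.
 */
#if 0
static int set_bdflush_param(int i, int val)
{
	if (i < 0 || i >= N_PARAM)
		return -EINVAL;
	if (val < bdflush_min[i] || val > bdflush_max[i])
		return -EINVAL;
	bdf_prm.data[i] = val;
	return 0;
}
#endif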
118 void wakeup_bdflush(int);
121 * Rewrote the wait-routines to use the "new" wait-queue functionality,
122 * and got rid of the cli-sti pairs. The wait-queue routines still
123 * need cli-sti, but now it's just a couple of 386 instructions or so.
125 * Note that the real wait_on_buffer() is an inline function that checks
126 * if 'b_wait' is set before calling this, so that the queues aren't set
127 * up unnecessarily.
129 void __wait_on_buffer(struct buffer_head * bh)
131 struct task_struct *tsk = current;
132 DECLARE_WAITQUEUE(wait, tsk);
134 bh->b_count++;
135 add_wait_queue(&bh->b_wait, &wait);
136 repeat:
137 tsk->state = TASK_UNINTERRUPTIBLE;
138 run_task_queue(&tq_disk);
139 if (buffer_locked(bh)) {
140 schedule();
141 goto repeat;
143 tsk->state = TASK_RUNNING;
144 remove_wait_queue(&bh->b_wait, &wait);
145 bh->b_count--;
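/*
 * For reference, a minimal sketch of the inline wrapper described above
 * (the real wait_on_buffer() lives in a header; treat this as an
 * approximation, kept under #if 0 so it is illustration only):
 */
#if 0
static inline void wait_on_buffer_sketch(struct buffer_head * bh)
{
	if (buffer_locked(bh))
		__wait_on_buffer(bh);
}
#endif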
148 /* Call sync_buffers with wait!=0 to ensure that the call does not
149 * return until all buffer writes have completed. Sync() may return
150 * before the writes have finished; fsync() may not.
153 /* Godamity-damn. Some buffers (bitmaps for filesystems)
154 * spontaneously dirty themselves without ever brelse being called.
155 * We will ultimately want to put these in a separate list, but for
156 * now we search all of the lists for dirty buffers.
158 static int sync_buffers(kdev_t dev, int wait)
160 int i, retry, pass = 0, err = 0;
161 struct buffer_head * bh, *next;
163 /* One pass for no-wait, three for wait:
164 * 0) write out all dirty, unlocked buffers;
165 * 1) write out all dirty buffers, waiting if locked;
166 * 2) wait for completion by waiting for all buffers to unlock.
168 do {
169 retry = 0;
170 repeat:
171 /* We search all lists as a failsafe mechanism, not because we expect
172 * there to be dirty buffers on any of the other lists.
174 bh = lru_list[BUF_DIRTY];
175 if (!bh)
176 goto repeat2;
177 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
178 if (bh->b_list != BUF_DIRTY)
179 goto repeat;
180 next = bh->b_next_free;
181 if (!lru_list[BUF_DIRTY])
182 break;
183 if (dev && bh->b_dev != dev)
184 continue;
185 if (buffer_locked(bh)) {
186 /* Buffer is locked; skip it unless wait is
187 * requested AND pass > 0.
189 if (!wait || !pass) {
190 retry = 1;
191 continue;
193 wait_on_buffer (bh);
194 goto repeat;
197 /* If an unlocked buffer is not uptodate, there has
198 * been an IO error. Skip it.
200 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
201 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
202 err = -EIO;
203 continue;
206 /* Don't write clean buffers. Don't write ANY buffers
207 * on the third pass.
209 if (!buffer_dirty(bh) || pass >= 2)
210 continue;
212 /* Don't bother about locked buffers.
214 * XXX We checked if it was locked above and there is no
215 * XXX way we could have slept in between. -DaveM
217 if (buffer_locked(bh))
218 continue;
219 bh->b_count++;
220 next->b_count++;
221 bh->b_flushtime = 0;
222 ll_rw_block(WRITE, 1, &bh);
223 bh->b_count--;
224 next->b_count--;
225 retry = 1;
228 repeat2:
229 bh = lru_list[BUF_LOCKED];
230 if (!bh)
231 break;
232 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
233 if (bh->b_list != BUF_LOCKED)
234 goto repeat2;
235 next = bh->b_next_free;
236 if (!lru_list[BUF_LOCKED])
237 break;
238 if (dev && bh->b_dev != dev)
239 continue;
240 if (buffer_locked(bh)) {
241 /* Buffer is locked; skip it unless wait is
242 * requested AND pass > 0.
244 if (!wait || !pass) {
245 retry = 1;
246 continue;
248 wait_on_buffer (bh);
249 goto repeat2;
253 /* If we are waiting for the sync to succeed, and if any dirty
254 * blocks were written, then repeat; on the second pass, only
255 * wait for buffers being written (do not pass to write any
256 * more buffers on the second pass).
258 } while (wait && retry && ++pass<=2);
259 return err;
262 void sync_dev(kdev_t dev)
264 sync_buffers(dev, 0);
265 sync_supers(dev);
266 sync_inodes(dev);
267 sync_buffers(dev, 0);
268 DQUOT_SYNC(dev);
270 * FIXME(eric) we need to sync the physical devices here.
271 * This is because some (scsi) controllers have huge amounts of
272 * cache onboard (hundreds of Mb), and we need to instruct
273 * them to commit all of the dirty memory to disk, and we should
274 * not return until this has happened.
276 * This would need to get implemented by going through the assorted
277 * layers so that each block major number can be synced, and this
278 * would call down into the upper and mid-layer scsi.
282 int fsync_dev(kdev_t dev)
284 sync_buffers(dev, 0);
285 sync_supers(dev);
286 sync_inodes(dev);
287 DQUOT_SYNC(dev);
288 return sync_buffers(dev, 1);
291 asmlinkage int sys_sync(void)
293 lock_kernel();
294 fsync_dev(0);
295 unlock_kernel();
296 return 0;
300 * filp may be NULL if called via the msync of a vma.
303 int file_fsync(struct file *filp, struct dentry *dentry)
305 struct inode * inode = dentry->d_inode;
306 struct super_block * sb;
307 kdev_t dev;
309 /* sync the inode to buffers */
310 write_inode_now(inode);
312 /* sync the superblock to buffers */
313 sb = inode->i_sb;
314 wait_on_super(sb);
315 if (sb->s_op && sb->s_op->write_super)
316 sb->s_op->write_super(sb);
318 /* .. finally sync the buffers to disk */
319 dev = inode->i_dev;
320 return sync_buffers(dev, 1);
323 asmlinkage int sys_fsync(unsigned int fd)
325 struct file * file;
326 struct dentry * dentry;
327 struct inode * inode;
328 int err;
330 lock_kernel();
331 err = -EBADF;
332 file = fget(fd);
333 if (!file)
334 goto out;
336 dentry = file->f_dentry;
337 if (!dentry)
338 goto out_putf;
340 inode = dentry->d_inode;
341 if (!inode)
342 goto out_putf;
344 err = -EINVAL;
345 if (!file->f_op || !file->f_op->fsync)
346 goto out_putf;
348 /* We need to protect against concurrent writers.. */
349 down(&inode->i_sem);
350 err = file->f_op->fsync(file, dentry);
351 up(&inode->i_sem);
353 out_putf:
354 fput(file);
355 out:
356 unlock_kernel();
357 return err;
360 asmlinkage int sys_fdatasync(unsigned int fd)
362 struct file * file;
363 struct dentry * dentry;
364 struct inode * inode;
365 int err;
367 lock_kernel();
368 err = -EBADF;
369 file = fget(fd);
370 if (!file)
371 goto out;
373 dentry = file->f_dentry;
374 if (!dentry)
375 goto out_putf;
377 inode = dentry->d_inode;
378 if (!inode)
379 goto out_putf;
381 err = -EINVAL;
382 if (!file->f_op || !file->f_op->fsync)
383 goto out_putf;
385 /* this needs further work, at the moment it is identical to fsync() */
386 down(&inode->i_sem);
387 err = file->f_op->fsync(file, dentry);
388 up(&inode->i_sem);
390 out_putf:
391 fput(file);
392 out:
393 unlock_kernel();
394 return err;
397 void invalidate_buffers(kdev_t dev)
399 int i;
400 int nlist;
401 struct buffer_head * bh;
403 for(nlist = 0; nlist < NR_LIST; nlist++) {
404 bh = lru_list[nlist];
405 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
406 if (bh->b_dev != dev)
407 continue;
408 wait_on_buffer(bh);
409 if (bh->b_dev != dev)
410 continue;
411 if (bh->b_count)
412 continue;
413 bh->b_flushtime = 0;
414 clear_bit(BH_Protected, &bh->b_state);
415 clear_bit(BH_Uptodate, &bh->b_state);
416 clear_bit(BH_Dirty, &bh->b_state);
417 clear_bit(BH_Req, &bh->b_state);
422 #define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
423 #define hash(dev,block) hash_table[_hashfn(dev,block)]
425 static inline void remove_from_hash_queue(struct buffer_head * bh)
427 struct buffer_head **pprev = bh->b_pprev;
428 if (pprev) {
429 struct buffer_head * next = bh->b_next;
430 if (next) {
431 next->b_pprev = pprev;
432 bh->b_next = NULL;
434 *pprev = next;
435 bh->b_pprev = NULL;
437 nr_hashed_buffers--;
440 static inline void remove_from_lru_list(struct buffer_head * bh)
442 if (!(bh->b_prev_free) || !(bh->b_next_free))
443 panic("VFS: LRU block list corrupted");
444 if (bh->b_dev == B_FREE)
445 panic("LRU list corrupted");
446 bh->b_prev_free->b_next_free = bh->b_next_free;
447 bh->b_next_free->b_prev_free = bh->b_prev_free;
449 if (lru_list[bh->b_list] == bh)
450 lru_list[bh->b_list] = bh->b_next_free;
451 if (lru_list[bh->b_list] == bh)
452 lru_list[bh->b_list] = NULL;
453 bh->b_next_free = bh->b_prev_free = NULL;
456 static inline void remove_from_free_list(struct buffer_head * bh)
458 int isize = BUFSIZE_INDEX(bh->b_size);
459 if (!(bh->b_prev_free) || !(bh->b_next_free))
460 panic("VFS: Free block list corrupted");
461 if(bh->b_dev != B_FREE)
462 panic("Free list corrupted");
463 if(!free_list[isize])
464 panic("Free list empty");
465 if(bh->b_next_free == bh)
466 free_list[isize] = NULL;
467 else {
468 bh->b_prev_free->b_next_free = bh->b_next_free;
469 bh->b_next_free->b_prev_free = bh->b_prev_free;
470 if (free_list[isize] == bh)
471 free_list[isize] = bh->b_next_free;
473 bh->b_next_free = bh->b_prev_free = NULL;
476 static void remove_from_queues(struct buffer_head * bh)
478 if(bh->b_dev == B_FREE) {
479 remove_from_free_list(bh); /* Free list entries should not be
480 in the hash queue */
481 return;
483 nr_buffers_type[bh->b_list]--;
484 remove_from_hash_queue(bh);
485 remove_from_lru_list(bh);
488 static inline void put_last_free(struct buffer_head * bh)
490 if (bh) {
491 struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
493 bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */
495 /* Add to back of free list. */
496 if(!*bhp) {
497 *bhp = bh;
498 bh->b_prev_free = bh;
501 bh->b_next_free = *bhp;
502 bh->b_prev_free = (*bhp)->b_prev_free;
503 (*bhp)->b_prev_free->b_next_free = bh;
504 (*bhp)->b_prev_free = bh;
508 static void insert_into_queues(struct buffer_head * bh)
510 /* put at end of free list */
511 if(bh->b_dev == B_FREE) {
512 put_last_free(bh);
513 } else {
514 struct buffer_head **bhp = &lru_list[bh->b_list];
516 if(!*bhp) {
517 *bhp = bh;
518 bh->b_prev_free = bh;
521 if (bh->b_next_free)
522 panic("VFS: buffer LRU pointers corrupted");
524 bh->b_next_free = *bhp;
525 bh->b_prev_free = (*bhp)->b_prev_free;
526 (*bhp)->b_prev_free->b_next_free = bh;
527 (*bhp)->b_prev_free = bh;
529 nr_buffers_type[bh->b_list]++;
531 /* Put the buffer in new hash-queue if it has a device. */
532 bh->b_next = NULL;
533 bh->b_pprev = NULL;
534 if (bh->b_dev) {
535 struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
536 struct buffer_head *next = *bhp;
538 if (next) {
539 bh->b_next = next;
540 next->b_pprev = &bh->b_next;
542 *bhp = bh;
543 bh->b_pprev = bhp;
545 nr_hashed_buffers++;
549 struct buffer_head * find_buffer(kdev_t dev, int block, int size)
551 struct buffer_head * next;
553 next = hash(dev,block);
554 for (;;) {
555 struct buffer_head *tmp = next;
556 if (!next)
557 break;
558 next = tmp->b_next;
559 if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
560 continue;
561 next = tmp;
562 break;
564 return next;
568 * Why like this, I hear you say... The reason is race-conditions.
569 * As we don't lock buffers (unless we are reading them, that is),
570 * something might happen to it while we sleep (ie a read-error
571 * will force it bad). This shouldn't really happen currently, but
572 * the code is ready.
574 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
576 struct buffer_head * bh;
577 bh = find_buffer(dev,block,size);
578 if (bh)
579 bh->b_count++;
580 return bh;
583 unsigned int get_hardblocksize(kdev_t dev)
586 * Get the hard sector size for the given device. If we don't know
587 * what it is, return 0.
589 if (hardsect_size[MAJOR(dev)] != NULL) {
590 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
591 if (blksize != 0)
592 return blksize;
596 * We don't know what the hardware sector size for this device is.
597 * Return 0 indicating that we don't know.
599 return 0;
602 void set_blocksize(kdev_t dev, int size)
604 extern int *blksize_size[];
605 int i, nlist;
606 struct buffer_head * bh, *bhnext;
608 if (!blksize_size[MAJOR(dev)])
609 return;
611 /* Size must be a power of two, and between 512 and PAGE_SIZE */
612 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
613 panic("Invalid blocksize passed to set_blocksize");
615 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
616 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
617 return;
619 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
620 return;
621 sync_buffers(dev, 2);
622 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
624 /* We need to be quite careful how we do this - we are moving entries
625 * around on the free list, and we can get in a loop if we are not careful.
627 for(nlist = 0; nlist < NR_LIST; nlist++) {
628 bh = lru_list[nlist];
629 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
630 if(!bh)
631 break;
633 bhnext = bh->b_next_free;
634 if (bh->b_dev != dev)
635 continue;
636 if (bh->b_size == size)
637 continue;
638 bhnext->b_count++;
639 wait_on_buffer(bh);
640 bhnext->b_count--;
641 if (bh->b_dev == dev && bh->b_size != size) {
642 clear_bit(BH_Dirty, &bh->b_state);
643 clear_bit(BH_Uptodate, &bh->b_state);
644 clear_bit(BH_Req, &bh->b_state);
645 bh->b_flushtime = 0;
647 remove_from_queues(bh);
648 bh->b_dev=B_FREE;
649 insert_into_queues(bh);
655 * We used to try various strange things. Let's not.
657 static void refill_freelist(int size)
659 if (!grow_buffers(size)) {
660 wakeup_bdflush(1);
661 current->policy |= SCHED_YIELD;
662 schedule();
666 void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
667 bh_end_io_t *handler, void *dev_id)
669 bh->b_count = 1;
670 bh->b_list = BUF_CLEAN;
671 bh->b_flushtime = 0;
672 bh->b_dev = dev;
673 bh->b_blocknr = block;
674 bh->b_end_io = handler;
675 bh->b_dev_id = dev_id;
678 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
680 mark_buffer_uptodate(bh, uptodate);
681 unlock_buffer(bh);
685 * Ok, this is getblk, and it isn't very clear, again to hinder
686 * race-conditions. Most of the code is seldom used, (ie repeating),
687 * so it should be much more efficient than it looks.
689 * The algorithm is changed: hopefully better, and an elusive bug removed.
691 * 14.02.92: changed it to sync dirty buffers a bit: better performance
692 * when the filesystem starts to get full of dirty blocks (I hope).
694 struct buffer_head * getblk(kdev_t dev, int block, int size)
696 struct buffer_head * bh;
697 int isize;
699 repeat:
700 bh = get_hash_table(dev, block, size);
701 if (bh) {
702 if (!buffer_dirty(bh)) {
703 bh->b_flushtime = 0;
705 return bh;
708 isize = BUFSIZE_INDEX(size);
709 get_free:
710 bh = free_list[isize];
711 if (!bh)
712 goto refill;
713 remove_from_free_list(bh);
715 /* OK, FINALLY we know that this buffer is the only one of its kind,
716 * and that it's unused (b_count=0), unlocked, and clean.
718 init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
719 bh->b_state=0;
720 insert_into_queues(bh);
721 return bh;
724 * If we block while refilling the free list, somebody may
725 * create the buffer first ... search the hashes again.
727 refill:
728 refill_freelist(size);
729 if (!find_buffer(dev,block,size))
730 goto get_free;
731 goto repeat;
734 void set_writetime(struct buffer_head * buf, int flag)
736 int newtime;
738 if (buffer_dirty(buf)) {
739 /* Buffer is dirty: record when it should be flushed, keeping the earliest deadline. */
740 newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
741 bdf_prm.b_un.age_buffer);
742 if(!buf->b_flushtime || buf->b_flushtime > newtime)
743 buf->b_flushtime = newtime;
744 } else {
745 buf->b_flushtime = 0;
751 * Put a buffer into the appropriate list, without side-effects.
753 static inline void file_buffer(struct buffer_head *bh, int list)
755 remove_from_queues(bh);
756 bh->b_list = list;
757 insert_into_queues(bh);
761 * A buffer may need to be moved from one buffer list to another
762 * (e.g. in case it is not shared any more). Handle this.
764 void refile_buffer(struct buffer_head * buf)
766 int dispose;
768 if(buf->b_dev == B_FREE) {
769 printk("Attempt to refile free buffer\n");
770 return;
772 if (buffer_dirty(buf))
773 dispose = BUF_DIRTY;
774 else if (buffer_locked(buf))
775 dispose = BUF_LOCKED;
776 else
777 dispose = BUF_CLEAN;
778 if(dispose != buf->b_list) {
779 file_buffer(buf, dispose);
780 if(dispose == BUF_DIRTY) {
781 int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
783 /* This buffer is dirty, maybe we need to start flushing.
784 * If too high a percentage of the buffers are dirty...
786 if (nr_buffers_type[BUF_DIRTY] > too_many)
787 wakeup_bdflush(1);
789 /* If this is a loop device, and
790 * more than half of the buffers are dirty...
791 * (Prevents no-free-buffers deadlock with loop device.)
793 if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
794 nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
795 wakeup_bdflush(1);
801 * Release a buffer head
803 void __brelse(struct buffer_head * buf)
805 /* If dirty, mark the time this buffer should be written back. */
806 set_writetime(buf, 0);
807 refile_buffer(buf);
808 touch_buffer(buf);
810 if (buf->b_count) {
811 buf->b_count--;
812 return;
814 printk("VFS: brelse: Trying to free free buffer\n");
818 * bforget() is like brelse(), except it puts the buffer on the
819 * free list if it can.. We can NOT free the buffer if:
820 * - there are other users of it
821 * - it is locked and thus can have active IO
823 void __bforget(struct buffer_head * buf)
825 if (buf->b_count != 1 || buffer_locked(buf)) {
826 __brelse(buf);
827 return;
829 buf->b_count = 0;
830 buf->b_state = 0;
831 remove_from_queues(buf);
832 put_last_free(buf);
836 * bread() reads a specified block and returns the buffer that contains
837 * it. It returns NULL if the block was unreadable.
839 struct buffer_head * bread(kdev_t dev, int block, int size)
841 struct buffer_head * bh;
843 bh = getblk(dev, block, size);
844 if (buffer_uptodate(bh))
845 return bh;
846 ll_rw_block(READ, 1, &bh);
847 wait_on_buffer(bh);
848 if (buffer_uptodate(bh))
849 return bh;
850 brelse(bh);
851 return NULL;
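/*
 * Typical caller-side sketch (illustrative, not from this file): a
 * filesystem reads a metadata block and releases it when done, e.g.
 *
 *	struct buffer_head *bh = bread(dev, block, BLOCK_SIZE);
 *	if (!bh)
 *		return -EIO;
 *	memcpy(data, bh->b_data, BLOCK_SIZE);
 *	brelse(bh);
 */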
855 * Ok, breada can be used as bread, but additionally it starts read-ahead
856 * on up to NBUF of the following blocks; the amount of read-ahead is
857 * bounded by the file size (pos/filesize) and the device's read_ahead setting.
860 #define NBUF 16
862 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
863 unsigned int pos, unsigned int filesize)
865 struct buffer_head * bhlist[NBUF];
866 unsigned int blocks;
867 struct buffer_head * bh;
868 int index;
869 int i, j;
871 if (pos >= filesize)
872 return NULL;
874 if (block < 0)
875 return NULL;
877 bh = getblk(dev, block, bufsize);
878 index = BUFSIZE_INDEX(bh->b_size);
880 if (buffer_uptodate(bh))
881 return(bh);
882 else ll_rw_block(READ, 1, &bh);
884 blocks = (filesize - pos) >> (9+index);
886 if (blocks < (read_ahead[MAJOR(dev)] >> index))
887 blocks = read_ahead[MAJOR(dev)] >> index;
888 if (blocks > NBUF)
889 blocks = NBUF;
891 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
894 bhlist[0] = bh;
895 j = 1;
896 for(i=1; i<blocks; i++) {
897 bh = getblk(dev,block+i,bufsize);
898 if (buffer_uptodate(bh)) {
899 brelse(bh);
900 break;
902 else bhlist[j++] = bh;
905 /* Request the read for these buffers, and then release them. */
906 if (j>1)
907 ll_rw_block(READA, (j-1), bhlist+1);
908 for(i=1; i<j; i++)
909 brelse(bhlist[i]);
911 /* Wait for this buffer, and then continue on. */
912 bh = bhlist[0];
913 wait_on_buffer(bh);
914 if (buffer_uptodate(bh))
915 return bh;
916 brelse(bh);
917 return NULL;
921 * Note: the caller should wake up the buffer_wait list if needed.
923 static void put_unused_buffer_head(struct buffer_head * bh)
925 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
926 nr_buffer_heads--;
927 kmem_cache_free(bh_cachep, bh);
928 return;
931 memset(bh,0,sizeof(*bh));
932 init_waitqueue_head(&bh->b_wait);
933 nr_unused_buffer_heads++;
934 bh->b_next_free = unused_list;
935 unused_list = bh;
939 * We can't put completed temporary IO buffer_heads directly onto the
940 * unused_list when they become unlocked, since the device driver
941 * end_request routines still expect access to the buffer_head's
942 * fields after the final unlock. So, the device driver puts them on
943 * the reuse_list instead once IO completes, and we recover these to
944 * the unused_list here.
946 * Note that we don't do a wakeup here, but return a flag indicating
947 * whether we got any buffer heads. A task ready to sleep can check
948 * the returned value, and any tasks already sleeping will have been
949 * awakened when the buffer heads were added to the reuse list.
951 static inline int recover_reusable_buffer_heads(void)
953 struct buffer_head *head = xchg(&reuse_list, NULL);
954 int found = 0;
956 if (head) {
957 do {
958 struct buffer_head *bh = head;
959 head = head->b_next_free;
960 put_unused_buffer_head(bh);
961 } while (head);
962 found = 1;
964 return found;
968 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
969 * no-buffer-head deadlock. Return NULL on failure; waiting for
970 * buffer heads is now handled in create_buffers().
972 static struct buffer_head * get_unused_buffer_head(int async)
974 struct buffer_head * bh;
976 recover_reusable_buffer_heads();
977 if (nr_unused_buffer_heads > NR_RESERVED) {
978 bh = unused_list;
979 unused_list = bh->b_next_free;
980 nr_unused_buffer_heads--;
981 return bh;
984 /* This is critical. We can't swap out pages to get
985 * more buffer heads, because the swap-out may need
986 * more buffer-heads itself. Thus SLAB_BUFFER.
988 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
989 memset(bh, 0, sizeof(*bh));
990 init_waitqueue_head(&bh->b_wait);
991 nr_buffer_heads++;
992 return bh;
996 * If we need an async buffer, use the reserved buffer heads.
998 if (async && unused_list) {
999 bh = unused_list;
1000 unused_list = bh->b_next_free;
1001 nr_unused_buffer_heads--;
1002 return bh;
1005 #if 0
1007 * (Pending further analysis ...)
1008 * Ordinary (non-async) requests can use a different memory priority
1009 * to free up pages. Any swapping thus generated will use async
1010 * buffer heads.
1012 if(!async &&
1013 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1014 memset(bh, 0, sizeof(*bh));
1015 init_waitqueue_head(&bh->b_wait);
1016 nr_buffer_heads++;
1017 return bh;
1019 #endif
1021 return NULL;
1025 * Create the appropriate buffers when given a page for data area and
1026 * the size of each buffer.. Use the bh->b_this_page linked list to
1027 * follow the buffers created. Return NULL if unable to create more
1028 * buffers.
1029 * The async flag is used to differentiate async IO (paging, swapping)
1030 * from ordinary buffer allocations, and only async requests are allowed
1031 * to sleep waiting for buffer heads.
1033 static struct buffer_head * create_buffers(unsigned long page,
1034 unsigned long size, int async)
1036 DECLARE_WAITQUEUE(wait, current);
1037 struct buffer_head *bh, *head;
1038 long offset;
1040 try_again:
1041 head = NULL;
1042 offset = PAGE_SIZE;
1043 while ((offset -= size) >= 0) {
1044 bh = get_unused_buffer_head(async);
1045 if (!bh)
1046 goto no_grow;
1048 bh->b_dev = B_FREE; /* Flag as unused */
1049 bh->b_this_page = head;
1050 head = bh;
1052 bh->b_state = 0;
1053 bh->b_next_free = NULL;
1054 bh->b_count = 0;
1055 bh->b_size = size;
1057 bh->b_data = (char *) (page+offset);
1058 bh->b_list = 0;
1060 return head;
1062 * In case anything failed, we just free everything we got.
1064 no_grow:
1065 if (head) {
1066 do {
1067 bh = head;
1068 head = head->b_this_page;
1069 put_unused_buffer_head(bh);
1070 } while (head);
1072 /* Wake up any waiters ... */
1073 wake_up(&buffer_wait);
1077 * Return failure for non-async IO requests. Async IO requests
1078 * are not allowed to fail, so we have to wait until buffer heads
1079 * become available. But we don't want tasks sleeping with
1080 * partially complete buffers, so all were released above.
1082 if (!async)
1083 return NULL;
1085 /* We're _really_ low on memory. Now we just
1086 * wait for old buffer heads to become free due to
1087 * finishing IO. Since this is an async request and
1088 * the reserve list is empty, we're sure there are
1089 * async buffer heads in use.
1091 run_task_queue(&tq_disk);
1094 * Set our state for sleeping, then check again for buffer heads.
1095 * This ensures we won't miss a wake_up from an interrupt.
1097 add_wait_queue(&buffer_wait, &wait);
1098 current->state = TASK_UNINTERRUPTIBLE;
1099 if (!recover_reusable_buffer_heads())
1100 schedule();
1101 remove_wait_queue(&buffer_wait, &wait);
1102 current->state = TASK_RUNNING;
1103 goto try_again;
1106 /* Run the hooks that have to be done when a page I/O has completed. */
1107 static inline void after_unlock_page (struct page * page)
1109 if (test_and_clear_bit(PG_decr_after, &page->flags)) {
1110 atomic_dec(&nr_async_pages);
1111 #ifdef DEBUG_SWAP
1112 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1113 (char *) page_address(page),
1114 atomic_read(&nr_async_pages));
1115 #endif
1117 if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
1118 swap_after_unlock_page(page->offset);
1119 if (test_and_clear_bit(PG_free_after, &page->flags))
1120 __free_page(page);
1124 * Free all temporary buffers belonging to a page.
1125 * This needs to be called with interrupts disabled.
1127 static inline void free_async_buffers (struct buffer_head * bh)
1129 struct buffer_head *tmp, *tail;
1132 * Link all the buffers into the b_next_free list,
1133 * so we only have to do one xchg() operation ...
1135 tail = bh;
1136 while ((tmp = tail->b_this_page) != bh) {
1137 tail->b_next_free = tmp;
1138 tail = tmp;
1141 /* Update the reuse list */
1142 tail->b_next_free = xchg(&reuse_list, NULL);
1143 reuse_list = bh;
1145 /* Wake up any waiters ... */
1146 wake_up(&buffer_wait);
1149 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
1151 unsigned long flags;
1152 struct buffer_head *tmp;
1153 struct page *page;
1155 mark_buffer_uptodate(bh, uptodate);
1156 unlock_buffer(bh);
1158 /* This is a temporary buffer used for page I/O. */
1159 page = mem_map + MAP_NR(bh->b_data);
1160 if (!PageLocked(page))
1161 goto not_locked;
1162 if (bh->b_count != 1)
1163 goto bad_count;
1165 if (!test_bit(BH_Uptodate, &bh->b_state))
1166 set_bit(PG_error, &page->flags);
1169 * Be _very_ careful from here on. Bad things can happen if
1170 * two buffer heads end IO at almost the same time and both
1171 * decide that the page is now completely done.
1173 * Async buffer_heads are here only as labels for IO, and get
1174 * thrown away once the IO for this page is complete. IO is
1175 * deemed complete once all buffers have been visited
1176 * (b_count==0) and are now unlocked. We must make sure that
1177 * only the _last_ buffer that decrements its count is the one
1178 * that frees the page.
1180 save_flags(flags);
1181 cli();
1182 bh->b_count--;
1183 tmp = bh;
1184 do {
1185 if (tmp->b_count)
1186 goto still_busy;
1187 tmp = tmp->b_this_page;
1188 } while (tmp != bh);
1190 /* OK, the async IO on this page is complete. */
1191 free_async_buffers(bh);
1192 restore_flags(flags);
1193 clear_bit(PG_locked, &page->flags);
1194 wake_up(&page->wait);
1195 after_unlock_page(page);
1196 return;
1198 still_busy:
1199 restore_flags(flags);
1200 return;
1202 not_locked:
1203 printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1204 return;
1206 bad_count:
1207 printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1208 return;
1212 * Start I/O on a page.
1213 * This function expects the page to be locked and may return before I/O is complete.
1214 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1216 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1218 struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
1219 int block, nr;
1221 if (!PageLocked(page))
1222 panic("brw_page: page not locked for I/O");
1223 clear_bit(PG_uptodate, &page->flags);
1224 clear_bit(PG_error, &page->flags);
1226 * Allocate async buffer heads pointing to this page, just for I/O.
1227 * They do _not_ show up in the buffer hash table!
1228 * They are _not_ registered in page->buffers either!
1230 bh = create_buffers(page_address(page), size, 1);
1231 if (!bh) {
1232 /* WSH: exit here leaves page->count incremented */
1233 clear_bit(PG_locked, &page->flags);
1234 wake_up(&page->wait);
1235 return -ENOMEM;
1237 nr = 0;
1238 next = bh;
1239 do {
1240 struct buffer_head * tmp;
1241 block = *(b++);
1243 init_buffer(next, dev, block, end_buffer_io_async, NULL);
1244 set_bit(BH_Uptodate, &next->b_state);
1247 * When we use bmap, we define block zero to represent
1248 * a hole. ll_rw_page, however, may legitimately
1249 * access block zero, and we need to distinguish the
1250 * two cases.
1252 if (bmap && !block) {
1253 memset(next->b_data, 0, size);
1254 next->b_count--;
1255 continue;
1257 tmp = get_hash_table(dev, block, size);
1258 if (tmp) {
1259 if (!buffer_uptodate(tmp)) {
1260 if (rw == READ)
1261 ll_rw_block(READ, 1, &tmp);
1262 wait_on_buffer(tmp);
1264 if (rw == READ)
1265 memcpy(next->b_data, tmp->b_data, size);
1266 else {
1267 memcpy(tmp->b_data, next->b_data, size);
1268 mark_buffer_dirty(tmp, 0);
1270 brelse(tmp);
1271 next->b_count--;
1272 continue;
1274 if (rw == READ)
1275 clear_bit(BH_Uptodate, &next->b_state);
1276 else
1277 set_bit(BH_Dirty, &next->b_state);
1278 arr[nr++] = next;
1279 } while (prev = next, (next = next->b_this_page) != NULL);
1280 prev->b_this_page = bh;
1282 if (nr) {
1283 ll_rw_block(rw, nr, arr);
1284 /* The rest of the work is done in mark_buffer_uptodate()
1285 * and unlock_buffer(). */
1286 } else {
1287 unsigned long flags;
1288 clear_bit(PG_locked, &page->flags);
1289 set_bit(PG_uptodate, &page->flags);
1290 wake_up(&page->wait);
1291 save_flags(flags);
1292 cli();
1293 free_async_buffers(bh);
1294 restore_flags(flags);
1295 after_unlock_page(page);
1297 ++current->maj_flt;
1298 return 0;
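/*
 * Caller-side sketch (illustrative): as noted above brw_page(), the caller
 * locks the page, starts the I/O, and then checks or waits itself if it
 * needs the result synchronously, roughly:
 *
 *	set_bit(PG_locked, &page->flags);
 *	brw_page(READ, page, dev, blocks, blocksize, 1);
 *	wait_on_page(page);	(assumed helper that sleeps on page->wait)
 *	if (!test_bit(PG_uptodate, &page->flags))
 *		return -EIO;
 */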
1302 * This is called by end_request() when I/O has completed.
1304 void mark_buffer_uptodate(struct buffer_head * bh, int on)
1306 if (on) {
1307 struct buffer_head *tmp = bh;
1308 set_bit(BH_Uptodate, &bh->b_state);
1309 /* If a page has buffers and all these buffers are uptodate,
1310 * then the page is uptodate. */
1311 do {
1312 if (!test_bit(BH_Uptodate, &tmp->b_state))
1313 return;
1314 tmp=tmp->b_this_page;
1315 } while (tmp && tmp != bh);
1316 set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
1317 return;
1319 clear_bit(BH_Uptodate, &bh->b_state);
1323 * Generic "readpage" function for block devices that have the normal
1324 * bmap functionality. This is most of the block device filesystems.
1325 * Reads the page asynchronously --- the unlock_buffer() and
1326 * mark_buffer_uptodate() functions propagate buffer state into the
1327 * page struct once IO has completed.
1329 int generic_readpage(struct file * file, struct page * page)
1331 struct dentry *dentry = file->f_dentry;
1332 struct inode *inode = dentry->d_inode;
1333 unsigned long block;
1334 int *p, nr[PAGE_SIZE/512];
1335 int i;
1337 atomic_inc(&page->count);
1338 set_bit(PG_locked, &page->flags);
1339 set_bit(PG_free_after, &page->flags);
1341 i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1342 block = page->offset >> inode->i_sb->s_blocksize_bits;
1343 p = nr;
1344 do {
1345 *p = inode->i_op->bmap(inode, block);
1346 i--;
1347 block++;
1348 p++;
1349 } while (i > 0);
1351 /* IO start */
1352 brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
1353 return 0;
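/*
 * Worked example (illustrative): with a 4 KB page and a 1 KB filesystem
 * block size, s_blocksize_bits == 10, so i == 4 and block == page->offset >> 10;
 * the loop above bmap()s the four logical blocks backing this page into
 * nr[0..3] before brw_page() submits them in one request batch.
 */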
1357 * Try to increase the number of buffers available: the size argument
1358 * is used to determine what kind of buffers we want.
1360 static int grow_buffers(int size)
1362 unsigned long page;
1363 struct buffer_head *bh, *tmp;
1364 struct buffer_head * insert_point;
1365 int isize;
1367 if ((size & 511) || (size > PAGE_SIZE)) {
1368 printk("VFS: grow_buffers: size = %d\n",size);
1369 return 0;
1372 if (!(page = __get_free_page(GFP_BUFFER)))
1373 return 0;
1374 bh = create_buffers(page, size, 0);
1375 if (!bh) {
1376 free_page(page);
1377 return 0;
1380 isize = BUFSIZE_INDEX(size);
1381 insert_point = free_list[isize];
1383 tmp = bh;
1384 while (1) {
1385 if (insert_point) {
1386 tmp->b_next_free = insert_point->b_next_free;
1387 tmp->b_prev_free = insert_point;
1388 insert_point->b_next_free->b_prev_free = tmp;
1389 insert_point->b_next_free = tmp;
1390 } else {
1391 tmp->b_prev_free = tmp;
1392 tmp->b_next_free = tmp;
1394 insert_point = tmp;
1395 ++nr_buffers;
1396 if (tmp->b_this_page)
1397 tmp = tmp->b_this_page;
1398 else
1399 break;
1401 tmp->b_this_page = bh;
1402 free_list[isize] = bh;
1403 mem_map[MAP_NR(page)].buffers = bh;
1404 buffermem += PAGE_SIZE;
1405 return 1;
1409 * Can the buffer be thrown out?
1411 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1412 #define buffer_busy(bh) ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
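/*
 * Example (illustrative): a clean, unlocked, unprotected buffer with
 * b_count == 0 is not "busy" even if BH_Uptodate is set, so the page
 * holding it may be reclaimed by try_to_free_buffers() below.
 */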
1415 * try_to_free_buffers() checks if all the buffers on this particular page
1416 * are unused, and frees the page if so.
1418 * Wake up bdflush() if this fails - if we're running low on memory due
1419 * to dirty buffers, we need to flush them out as quickly as possible.
1421 int try_to_free_buffers(struct page * page_map)
1423 struct buffer_head * tmp, * bh = page_map->buffers;
1425 tmp = bh;
1426 do {
1427 struct buffer_head * p = tmp;
1429 tmp = tmp->b_this_page;
1430 if (!buffer_busy(p))
1431 continue;
1433 wakeup_bdflush(0);
1434 return 0;
1435 } while (tmp != bh);
1437 tmp = bh;
1438 do {
1439 struct buffer_head * p = tmp;
1440 tmp = tmp->b_this_page;
1441 nr_buffers--;
1442 remove_from_queues(p);
1443 put_unused_buffer_head(p);
1444 } while (tmp != bh);
1446 /* Wake up anyone waiting for buffer heads */
1447 wake_up(&buffer_wait);
1449 /* And free the page */
1450 buffermem -= PAGE_SIZE;
1451 page_map->buffers = NULL;
1452 __free_page(page_map);
1453 return 1;
1456 /* ================== Debugging =================== */
1458 void show_buffers(void)
1460 struct buffer_head * bh;
1461 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
1462 int protected = 0;
1463 int nlist;
1464 static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
1466 printk("Buffer memory: %6dkB\n",buffermem>>10);
1467 printk("Buffer heads: %6d\n",nr_buffer_heads);
1468 printk("Buffer blocks: %6d\n",nr_buffers);
1469 printk("Buffer hashed: %6d\n",nr_hashed_buffers);
1471 for(nlist = 0; nlist < NR_LIST; nlist++) {
1472 found = locked = dirty = used = lastused = protected = 0;
1473 bh = lru_list[nlist];
1474 if(!bh) continue;
1476 do {
1477 found++;
1478 if (buffer_locked(bh))
1479 locked++;
1480 if (buffer_protected(bh))
1481 protected++;
1482 if (buffer_dirty(bh))
1483 dirty++;
1484 if (bh->b_count)
1485 used++, lastused = found;
1486 bh = bh->b_next_free;
1487 } while (bh != lru_list[nlist]);
1488 printk("%8s: %d buffers, %d used (last=%d), "
1489 "%d locked, %d protected, %d dirty\n",
1490 buf_types[nlist], found, used, lastused,
1491 locked, protected, dirty);
1496 /* ===================== Init ======================= */
1499 * allocate the hash table and init the free list
1500 * Use gfp() for the hash table to decrease TLB misses, use
1501 * SLAB cache for buffer heads.
1503 void __init buffer_init(unsigned long memory_size)
1505 int order;
1506 unsigned int nr_hash;
1508 /* We need to guess at the right sort of size for the buffer cache
1509 hash table. The heuristic below, from working with large databases
1510 and keeping ext2 fsync times manageable, is the following: */
1512 memory_size >>= 20;
1513 for (order = 5; (1UL << order) < memory_size; order++);
1515 /* try to allocate something until we get it or we're asking
1516 for something that is really too small */
1518 do {
1519 nr_hash = (1UL << order) * PAGE_SIZE /
1520 sizeof(struct buffer_head *);
1521 hash_table = (struct buffer_head **)
1522 __get_free_pages(GFP_ATOMIC, order);
1523 } while (hash_table == NULL && --order > 4);
1525 if (!hash_table)
1526 panic("Failed to allocate buffer hash table\n");
1527 memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
1528 bh_hash_mask = nr_hash-1;
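/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096 and 4-byte
 * pointers): with 128 MB of memory, memory_size >> 20 == 128 and the loop
 * above stops at order 7, i.e. a 512 KB hash table, nr_hash == 131072
 * buckets and bh_hash_mask == 0x1ffff.
 */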
1530 bh_cachep = kmem_cache_create("buffer_head",
1531 sizeof(struct buffer_head),
1533 SLAB_HWCACHE_ALIGN, NULL, NULL);
1534 if(!bh_cachep)
1535 panic("Cannot create buffer head SLAB cache\n");
1537 * Allocate the reserved buffer heads.
1539 while (nr_buffer_heads < NR_RESERVED) {
1540 struct buffer_head * bh;
1542 bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
1543 if (!bh)
1544 break;
1545 put_unused_buffer_head(bh);
1546 nr_buffer_heads++;
1549 lru_list[BUF_CLEAN] = 0;
1550 grow_buffers(BLOCK_SIZE);
1554 /* ====================== bdflush support =================== */
1556 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1557 * response to dirty buffers. Once this process is activated, we write back
1558 * a limited number of buffers to the disks and then go back to sleep again.
1560 static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
1561 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
1562 struct task_struct *bdflush_tsk = 0;
1564 void wakeup_bdflush(int wait)
1566 if (current == bdflush_tsk)
1567 return;
1568 wake_up(&bdflush_wait);
1569 if (wait) {
1570 run_task_queue(&tq_disk);
1571 sleep_on(&bdflush_done);
1577 * Here we attempt to write back old buffers. We also try to flush inodes
1578 * and supers as well, since this function is essentially "update", and
1579 * otherwise there would be no way of ensuring that these quantities ever
1580 * get written back. Ideally, we would have a timestamp on the inodes
1581 * and superblocks so that we could write back only the old ones as well
1584 static int sync_old_buffers(void)
1586 int i;
1587 int ndirty, nwritten;
1588 int nlist;
1589 int ncount;
1590 struct buffer_head * bh, *next;
1592 sync_supers(0);
1593 sync_inodes(0);
1595 ncount = 0;
1596 #ifdef DEBUG
1597 for(nlist = 0; nlist < NR_LIST; nlist++)
1598 #else
1599 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
1600 #endif
1602 ndirty = 0;
1603 nwritten = 0;
1604 repeat:
1606 bh = lru_list[nlist];
1607 if(bh)
1608 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1609 /* We may have stalled while waiting for I/O to complete. */
1610 if(bh->b_list != nlist) goto repeat;
1611 next = bh->b_next_free;
1612 if(!lru_list[nlist]) {
1613 printk("Dirty list empty %d\n", i);
1614 break;
1617 /* Clean buffer on dirty list? Refile it */
1618 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
1619 refile_buffer(bh);
1620 continue;
1623 /* Unlocked buffer on locked list? Refile it */
1624 if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
1625 refile_buffer(bh);
1626 continue;
1629 if (buffer_locked(bh) || !buffer_dirty(bh))
1630 continue;
1631 ndirty++;
1632 if(time_before(jiffies, bh->b_flushtime))
1633 continue;
1634 nwritten++;
1635 next->b_count++;
1636 bh->b_count++;
1637 bh->b_flushtime = 0;
1638 #ifdef DEBUG
1639 if(nlist != BUF_DIRTY) ncount++;
1640 #endif
1641 ll_rw_block(WRITE, 1, &bh);
1642 bh->b_count--;
1643 next->b_count--;
1646 run_task_queue(&tq_disk);
1647 #ifdef DEBUG
1648 if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
1649 printk("Wrote %d/%d buffers\n", nwritten, ndirty);
1650 #endif
1651 run_task_queue(&tq_disk);
1652 return 0;
1656 /* This is the interface to bdflush. As we get more sophisticated, we can
1657 * pass tuning parameters to this "process", to adjust how it behaves.
1658 * We would want to verify each parameter, however, to make sure that it
1659 * is reasonable. */
1661 asmlinkage int sys_bdflush(int func, long data)
1663 int i, error = -EPERM;
1665 lock_kernel();
1666 if (!capable(CAP_SYS_ADMIN))
1667 goto out;
1669 if (func == 1) {
1670 error = sync_old_buffers();
1671 goto out;
1674 /* Basically, even func values >= 2 read a parameter and odd ones write it: func 2 reads param 0, func 3 writes param 0, func 4 reads param 1, etc */
1675 if (func >= 2) {
1676 i = (func-2) >> 1;
1677 error = -EINVAL;
1678 if (i < 0 || i >= N_PARAM)
1679 goto out;
1680 if((func & 1) == 0) {
1681 error = put_user(bdf_prm.data[i], (int*)data);
1682 goto out;
1684 if (data < bdflush_min[i] || data > bdflush_max[i])
1685 goto out;
1686 bdf_prm.data[i] = data;
1687 error = 0;
1688 goto out;
1691 /* Func 0 used to launch the actual bdflush and then never return
1692 * (unless explicitly killed). We now simply return zero here to
1693 * remain semi-compatible with present update(8) programs.
1695 error = 0;
1696 out:
1697 unlock_kernel();
1698 return error;
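/*
 * Userspace sketch (an assumption, not from this file): an update(8)-style
 * tool could use the func encoding handled above, where even func values
 * read a parameter and odd ones write it:
 *
 *	int val;
 *	syscall(SYS_bdflush, 2 + 2*i, (long) &val);	reads  bdf_prm.data[i]
 *	syscall(SYS_bdflush, 3 + 2*i, (long) newval);	writes bdf_prm.data[i]
 */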
1701 /* This is the actual bdflush daemon itself. It used to be started from
1702 * the syscall above, but now we launch it ourselves internally with
1703 * kernel_thread(...) directly after the first thread in init/main.c */
1705 /* To prevent deadlocks for a loop device:
1706 * 1) Do non-blocking writes to loop (avoids deadlock with running
1707 * out of request blocks).
1708 * 2) But do a blocking write if the only dirty buffers are loop buffers
1709 * (otherwise we go into an infinite busy-loop).
1710 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
1711 * with running out of free buffers for loop's "real" device).
1713 int bdflush(void * unused)
1715 int i;
1716 int ndirty;
1717 int nlist;
1718 int ncount;
1719 struct buffer_head * bh, *next;
1720 int major;
1721 int wrta_cmd = WRITEA; /* non-blocking write for LOOP */
1724 * We have a bare-bones task_struct, and really should fill
1725 * in a few more things so "top" and /proc/2/{exe,root,cwd}
1726 * display semi-sane things. Not real crucial though...
1729 current->session = 1;
1730 current->pgrp = 1;
1731 sprintf(current->comm, "kflushd");
1732 bdflush_tsk = current;
1735 * As a kernel thread we want to tamper with system buffers
1736 * and other internals and thus be subject to the SMP locking
1737 * rules. (On a uniprocessor box this does nothing).
1739 lock_kernel();
1741 for (;;) {
1742 #ifdef DEBUG
1743 printk("bdflush() activated...");
1744 #endif
1746 CHECK_EMERGENCY_SYNC
1748 ncount = 0;
1749 #ifdef DEBUG
1750 for(nlist = 0; nlist < NR_LIST; nlist++)
1751 #else
1752 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
1753 #endif
1755 ndirty = 0;
1756 repeat:
1758 bh = lru_list[nlist];
1759 if(bh)
1760 for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
1761 bh = next) {
1762 /* We may have stalled while waiting for I/O to complete. */
1763 if(bh->b_list != nlist) goto repeat;
1764 next = bh->b_next_free;
1765 if(!lru_list[nlist]) {
1766 printk("Dirty list empty %d\n", i);
1767 break;
1770 /* Clean buffer on dirty list? Refile it */
1771 if (nlist == BUF_DIRTY && !buffer_dirty(bh)) {
1772 refile_buffer(bh);
1773 continue;
1776 /* Unlocked buffer on locked list? Refile it */
1777 if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
1778 refile_buffer(bh);
1779 continue;
1782 if (buffer_locked(bh) || !buffer_dirty(bh))
1783 continue;
1784 major = MAJOR(bh->b_dev);
1785 /* Should we write back buffers that are shared or not??
1786 currently dirty buffers are not shared, so it does not matter */
1787 next->b_count++;
1788 bh->b_count++;
1789 ndirty++;
1790 bh->b_flushtime = 0;
1791 if (major == LOOP_MAJOR) {
1792 ll_rw_block(wrta_cmd,1, &bh);
1793 wrta_cmd = WRITEA;
1794 if (buffer_dirty(bh))
1795 --ndirty;
1797 else
1798 ll_rw_block(WRITE, 1, &bh);
1799 #ifdef DEBUG
1800 if(nlist != BUF_DIRTY) ncount++;
1801 #endif
1802 bh->b_count--;
1803 next->b_count--;
1806 #ifdef DEBUG
1807 if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
1808 printk("sleeping again.\n");
1809 #endif
1810 /* If we didn't write anything, but there are still
1811 * dirty buffers, then make the next write to a
1812 * loop device to be a blocking write.
1813 * This lets us block--which we _must_ do! */
1814 if (ndirty == 0 && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
1815 wrta_cmd = WRITE;
1816 continue;
1818 run_task_queue(&tq_disk);
1819 wake_up(&bdflush_done);
1821 /* If there are still a lot of dirty buffers around, skip the sleep
1822 and flush some more */
1823 if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
1824 spin_lock_irq(&current->sigmask_lock);
1825 flush_signals(current);
1826 spin_unlock_irq(&current->sigmask_lock);
1828 interruptible_sleep_on(&bdflush_wait);