Import 2.3.1pre1
[davej-history.git] / fs / buffer.c
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */

/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */

/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */

/* Added 32k buffer block sizes - these are required on older ARM systems.
 * - RMK
 */
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/sysrq.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/quotaops.h>

#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/bitops.h>
#define NR_SIZES 7
static char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  6};

#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
					     number of unused buffer heads */
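/* For illustration: the table maps (size >> 9) to a small index, so
 * BUFSIZE_INDEX(512) == 0, BUFSIZE_INDEX(1024) == 1, BUFSIZE_INDEX(4096) == 3
 * and BUFSIZE_INDEX(32768) == 6; any size that is not a power of two
 * between 512 and 32k yields -1.
 */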
/*
 * Hash table mask..
 */
static unsigned long bh_hash_mask = 0;

static int grow_buffers(int size);

static struct buffer_head ** hash_table;
static struct buffer_head * lru_list[NR_LIST] = {NULL, };
static struct buffer_head * free_list[NR_SIZES] = {NULL, };

static kmem_cache_t *bh_cachep;

static struct buffer_head * unused_list = NULL;
static struct buffer_head * reuse_list = NULL;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);

static int nr_buffers = 0;
static int nr_buffers_type[NR_LIST] = {0,};
static int nr_buffer_heads = 0;
static int nr_unused_buffer_heads = 0;
static int nr_hashed_buffers = 0;

/* This is used by some architectures to estimate available memory. */
int buffermem = 0;
/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
union bdflush_param {
	struct {
		int nfract;	/* Percentage of buffer cache dirty to
				   activate bdflush */
		int ndirty;	/* Maximum number of dirty blocks to write out per
				   wake-cycle */
		int nrefill;	/* Number of clean buffers to try to obtain
				   each time we call refill */
		int nref_dirt;	/* Dirty buffer threshold for activating bdflush
				   when trying to refill buffers. */
		int interval;	/* Interval (seconds) between spontaneous
				   bdflush runs */
		int age_buffer;	/* Time for normal buffer to age before
				   we flush it */
		int age_super;	/* Time for superblock to age before we
				   flush it */
		int dummy2;	/* unused */
		int dummy3;	/* unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{40, 500, 64, 256, 5, 30*HZ, 5*HZ, 1884, 2}};
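/* With the defaults above bdflush is woken once more than 40% of the
 * buffer cache is dirty, writes back at most 500 buffers per wake-cycle,
 * and runs spontaneously every 5 seconds; ordinary buffers age for 30
 * seconds (superblocks for 5) before being flushed.  The dummy slots only
 * preserve the old /proc layout.
 */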
/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  1,   1*HZ,   1*HZ, 1, 1};
int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 600*HZ, 600*HZ, 2047, 5};

void wakeup_bdflush(int);
/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and getting rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * if 'b_wait' is set before calling this, so that the queues aren't set
 * up unnecessarily.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	bh->b_count++;
	add_wait_queue(&bh->b_wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (buffer_locked(bh)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
	bh->b_count--;
}
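/*
 * The loop above is the usual "set state, re-check, then sleep" idiom:
 * the task marks itself TASK_UNINTERRUPTIBLE before testing
 * buffer_locked(), so a wake_up() arriving between the test and the
 * schedule() simply leaves the task runnable and is never lost.
 */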
/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed.  Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
static int sync_buffers(kdev_t dev, int wait)
{
	int i, retry, pass = 0, err = 0;
	struct buffer_head * bh, *next;

	/* One pass for no-wait, three for wait:
	 * 0) write out all dirty, unlocked buffers;
	 * 1) write out all dirty buffers, waiting if locked;
	 * 2) wait for completion by waiting for all buffers to unlock.
	 */
	do {
		retry = 0;
	repeat:
		/* We search all lists as a failsafe mechanism, not because we expect
		 * there to be dirty buffers on any of the other lists.
		 */
		bh = lru_list[BUF_DIRTY];
		if (!bh)
			goto repeat2;

		for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
			if (bh->b_list != BUF_DIRTY)
				goto repeat;
			next = bh->b_next_free;
			if (!lru_list[BUF_DIRTY])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				wait_on_buffer (bh);
				goto repeat;
			}

			/* If an unlocked buffer is not uptodate, there has
			 * been an IO error. Skip it.
			 */
			if (wait && buffer_req(bh) && !buffer_locked(bh) &&
			    !buffer_dirty(bh) && !buffer_uptodate(bh)) {
				err = -EIO;
				continue;
			}

			/* Don't write clean buffers.  Don't write ANY buffers
			 * on the third pass.
			 */
			if (!buffer_dirty(bh) || pass >= 2)
				continue;

			/* Don't bother about locked buffers.
			 *
			 * XXX We checked if it was locked above and there is no
			 * XXX way we could have slept in between. -DaveM
			 */
			if (buffer_locked(bh))
				continue;
			bh->b_count++;
			next->b_count++;
			bh->b_flushtime = 0;
			ll_rw_block(WRITE, 1, &bh);
			bh->b_count--;
			next->b_count--;
			retry = 1;
		}

	repeat2:
		bh = lru_list[BUF_LOCKED];
		if (!bh)
			break;
		for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
			if (bh->b_list != BUF_LOCKED)
				goto repeat2;
			next = bh->b_next_free;
			if (!lru_list[BUF_LOCKED])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				wait_on_buffer (bh);
				goto repeat2;
			}
		}

	/* If we are waiting for the sync to succeed, and if any dirty
	 * blocks were written, then repeat; on the second pass, only
	 * wait for buffers being written (do not pass to write any
	 * more buffers on the second pass).
	 */
	} while (wait && retry && ++pass<=2);
	return err;
}
void sync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	sync_buffers(dev, 0);
	DQUOT_SYNC(dev);
	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */
}

int fsync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	DQUOT_SYNC(dev);
	return sync_buffers(dev, 1);
}

asmlinkage int sys_sync(void)
{
	lock_kernel();
	fsync_dev(0);
	unlock_kernel();
	return 0;
}
/*
 * filp may be NULL if called via the msync of a vma.
 */
int file_fsync(struct file *filp, struct dentry *dentry)
{
	struct inode * inode = dentry->d_inode;
	struct super_block * sb;
	kdev_t dev;

	/* sync the inode to buffers */
	write_inode_now(inode);

	/* sync the superblock to buffers */
	sb = inode->i_sb;
	wait_on_super(sb);
	if (sb->s_op && sb->s_op->write_super)
		sb->s_op->write_super(sb);

	/* .. finally sync the buffers to disk */
	dev = inode->i_dev;
	return sync_buffers(dev, 1);
}
asmlinkage int sys_fsync(unsigned int fd)
{
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;
	int err;

	lock_kernel();
	err = -EBADF;
	file = fget(fd);
	if (!file)
		goto out;

	dentry = file->f_dentry;
	if (!dentry)
		goto out_putf;

	inode = dentry->d_inode;
	if (!inode)
		goto out_putf;

	err = -EINVAL;
	if (!file->f_op || !file->f_op->fsync)
		goto out_putf;

	/* We need to protect against concurrent writers.. */
	down(&inode->i_sem);
	err = file->f_op->fsync(file, dentry);
	up(&inode->i_sem);

out_putf:
	fput(file);
out:
	unlock_kernel();
	return err;
}
asmlinkage int sys_fdatasync(unsigned int fd)
{
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;
	int err;

	lock_kernel();
	err = -EBADF;
	file = fget(fd);
	if (!file)
		goto out;

	dentry = file->f_dentry;
	if (!dentry)
		goto out_putf;

	inode = dentry->d_inode;
	if (!inode)
		goto out_putf;

	err = -EINVAL;
	if (!file->f_op || !file->f_op->fsync)
		goto out_putf;

	/* this needs further work, at the moment it is identical to fsync() */
	down(&inode->i_sem);
	err = file->f_op->fsync(file, dentry);
	up(&inode->i_sem);

out_putf:
	fput(file);
out:
	unlock_kernel();
	return err;
}
void invalidate_buffers(kdev_t dev)
{
	int i;
	int nlist;
	struct buffer_head * bh;

	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
			if (bh->b_dev != dev)
				continue;
			wait_on_buffer(bh);
			if (bh->b_dev != dev)
				continue;
			if (bh->b_count)
				continue;
			bh->b_flushtime = 0;
			clear_bit(BH_Protected, &bh->b_state);
			clear_bit(BH_Uptodate, &bh->b_state);
			clear_bit(BH_Dirty, &bh->b_state);
			clear_bit(BH_Req, &bh->b_state);
		}
	}
}
#define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
#define hash(dev,block) hash_table[_hashfn(dev,block)]
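/*
 * The hash table size is always a power of two, so masking with
 * bh_hash_mask (== nr_hash - 1) is the same as reducing the XOR of the
 * device and block number modulo the number of buckets.
 */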
static inline void remove_from_hash_queue(struct buffer_head * bh)
{
	struct buffer_head **pprev = bh->b_pprev;
	if (pprev) {
		struct buffer_head * next = bh->b_next;
		if (next) {
			next->b_pprev = pprev;
			bh->b_next = NULL;
		}
		*pprev = next;
		bh->b_pprev = NULL;
	}
	nr_hashed_buffers--;
}

static inline void remove_from_lru_list(struct buffer_head * bh)
{
	if (!(bh->b_prev_free) || !(bh->b_next_free))
		panic("VFS: LRU block list corrupted");
	if (bh->b_dev == B_FREE)
		panic("LRU list corrupted");
	bh->b_prev_free->b_next_free = bh->b_next_free;
	bh->b_next_free->b_prev_free = bh->b_prev_free;

	if (lru_list[bh->b_list] == bh)
		lru_list[bh->b_list] = bh->b_next_free;
	if (lru_list[bh->b_list] == bh)
		lru_list[bh->b_list] = NULL;
	bh->b_next_free = bh->b_prev_free = NULL;
}
static inline void remove_from_free_list(struct buffer_head * bh)
{
	int isize = BUFSIZE_INDEX(bh->b_size);
	if (!(bh->b_prev_free) || !(bh->b_next_free))
		panic("VFS: Free block list corrupted");
	if(bh->b_dev != B_FREE)
		panic("Free list corrupted");
	if(!free_list[isize])
		panic("Free list empty");
	if(bh->b_next_free == bh)
		free_list[isize] = NULL;
	else {
		bh->b_prev_free->b_next_free = bh->b_next_free;
		bh->b_next_free->b_prev_free = bh->b_prev_free;
		if (free_list[isize] == bh)
			free_list[isize] = bh->b_next_free;
	}
	bh->b_next_free = bh->b_prev_free = NULL;
}

static void remove_from_queues(struct buffer_head * bh)
{
	if(bh->b_dev == B_FREE) {
		remove_from_free_list(bh); /* Free list entries should not be
					      in the hash queue */
		return;
	}
	nr_buffers_type[bh->b_list]--;
	remove_from_hash_queue(bh);
	remove_from_lru_list(bh);
}
static inline void put_last_free(struct buffer_head * bh)
{
	if (bh) {
		struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];

		bh->b_dev = B_FREE;  /* So it is obvious we are on the free list. */

		/* Add to back of free list. */
		if(!*bhp) {
			*bhp = bh;
			bh->b_prev_free = bh;
		}

		bh->b_next_free = *bhp;
		bh->b_prev_free = (*bhp)->b_prev_free;
		(*bhp)->b_prev_free->b_next_free = bh;
		(*bhp)->b_prev_free = bh;
	}
}

static void insert_into_queues(struct buffer_head * bh)
{
	/* put at end of free list */
	if(bh->b_dev == B_FREE) {
		put_last_free(bh);
	} else {
		struct buffer_head **bhp = &lru_list[bh->b_list];

		if(!*bhp) {
			*bhp = bh;
			bh->b_prev_free = bh;
		}

		if (bh->b_next_free)
			panic("VFS: buffer LRU pointers corrupted");

		bh->b_next_free = *bhp;
		bh->b_prev_free = (*bhp)->b_prev_free;
		(*bhp)->b_prev_free->b_next_free = bh;
		(*bhp)->b_prev_free = bh;

		nr_buffers_type[bh->b_list]++;

		/* Put the buffer in new hash-queue if it has a device. */
		bh->b_next = NULL;
		bh->b_pprev = NULL;
		if (bh->b_dev) {
			struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
			struct buffer_head *next = *bhp;

			if (next) {
				bh->b_next = next;
				next->b_pprev = &bh->b_next;
			}
			*bhp = bh;
			bh->b_pprev = bhp;
		}
		nr_hashed_buffers++;
	}
}
struct buffer_head * find_buffer(kdev_t dev, int block, int size)
{
	struct buffer_head * next;

	next = hash(dev,block);
	for (;;) {
		struct buffer_head *tmp = next;
		if (!next)
			break;
		next = tmp->b_next;
		if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
			continue;
		next = tmp;
		break;
	}
	return next;
}

/*
 * Why like this, I hear you say... The reason is race-conditions.
 * As we don't lock buffers (unless we are reading them, that is),
 * something might happen to it while we sleep (ie a read-error
 * will force it bad). This shouldn't really happen currently, but
 * the code is ready.
 */
struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;
	bh = find_buffer(dev,block,size);
	if (bh)
		bh->b_count++;
	return bh;
}
unsigned int get_hardblocksize(kdev_t dev)
{
	/*
	 * Get the hard sector size for the given device.  If we don't know
	 * what it is, return 0.
	 */
	if (hardsect_size[MAJOR(dev)] != NULL) {
		int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
		if (blksize != 0)
			return blksize;
	}

	/*
	 * We don't know what the hardware sector size for this device is.
	 * Return 0 indicating that we don't know.
	 */
	return 0;
}
void set_blocksize(kdev_t dev, int size)
{
	extern int *blksize_size[];
	int i, nlist;
	struct buffer_head * bh, *bhnext;

	if (!blksize_size[MAJOR(dev)])
		return;

	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		panic("Invalid blocksize passed to set_blocksize");

	if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
		blksize_size[MAJOR(dev)][MINOR(dev)] = size;
		return;
	}
	if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
		return;
	sync_buffers(dev, 2);
	blksize_size[MAJOR(dev)][MINOR(dev)] = size;

	/* We need to be quite careful how we do this - we are moving entries
	 * around on the free list, and we can get in a loop if we are not careful.
	 */
	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
			if(!bh)
				break;

			bhnext = bh->b_next_free;
			if (bh->b_dev != dev)
				continue;
			if (bh->b_size == size)
				continue;
			bhnext->b_count++;
			wait_on_buffer(bh);
			bhnext->b_count--;
			if (bh->b_dev == dev && bh->b_size != size) {
				clear_bit(BH_Dirty, &bh->b_state);
				clear_bit(BH_Uptodate, &bh->b_state);
				clear_bit(BH_Req, &bh->b_state);
				bh->b_flushtime = 0;
			}
			remove_from_hash_queue(bh);
		}
	}
}
/*
 * We used to try various strange things. Let's not.
 */
static void refill_freelist(int size)
{
	if (!grow_buffers(size)) {
		wakeup_bdflush(1);
		current->policy |= SCHED_YIELD;
		schedule();
	}
}

void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
		 bh_end_io_t *handler, void *dev_id)
{
	bh->b_count = 1;
	bh->b_list = BUF_CLEAN;
	bh->b_flushtime = 0;
	bh->b_dev = dev;
	bh->b_blocknr = block;
	bh->b_end_io = handler;
	bh->b_dev_id = dev_id;
}

static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}
/*
 * Ok, this is getblk, and it isn't very clear, again to hinder
 * race-conditions. Most of the code is seldom used, (ie repeating),
 * so it should be much more efficient than it looks.
 *
 * The algorithm is changed: hopefully better, and an elusive bug removed.
 *
 * 14.02.92: changed it to sync dirty buffers a bit: better performance
 * when the filesystem starts to get full of dirty blocks (I hope).
 */
struct buffer_head * getblk(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;
	int isize;

repeat:
	bh = get_hash_table(dev, block, size);
	if (bh) {
		if (!buffer_dirty(bh)) {
			bh->b_flushtime = 0;
		}
		return bh;
	}

	isize = BUFSIZE_INDEX(size);
get_free:
	bh = free_list[isize];
	if (!bh)
		goto refill;
	remove_from_free_list(bh);

	/* OK, FINALLY we know that this buffer is the only one of its kind,
	 * and that it's unused (b_count=0), unlocked, and clean.
	 */
	init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
	bh->b_state=0;
	insert_into_queues(bh);
	return bh;

	/*
	 * If we block while refilling the free list, somebody may
	 * create the buffer first ... search the hashes again.
	 */
refill:
	refill_freelist(size);
	if (!find_buffer(dev,block,size))
		goto get_free;
	goto repeat;
}
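/*
 * In short, getblk() has three paths: a hash hit (returned immediately,
 * possibly locked or dirty - the caller handles that), a buffer taken
 * from the free list of the right size, or a pass through
 * refill_freelist() followed by a re-check of the hash, since we may
 * have slept and someone else may have created the block meanwhile.
 */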
void set_writetime(struct buffer_head * buf, int flag)
{
	int newtime;

	if (buffer_dirty(buf)) {
		/* Move buffer to dirty list if jiffies is clear. */
		newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
				     bdf_prm.b_un.age_buffer);
		if(!buf->b_flushtime || buf->b_flushtime > newtime)
			buf->b_flushtime = newtime;
	} else {
		buf->b_flushtime = 0;
	}
}

/*
 * Put a buffer into the appropriate list, without side-effects.
 */
static inline void file_buffer(struct buffer_head *bh, int list)
{
	remove_from_queues(bh);
	bh->b_list = list;
	insert_into_queues(bh);
}
/*
 * A buffer may need to be moved from one buffer list to another
 * (e.g. in case it is not shared any more). Handle this.
 */
void refile_buffer(struct buffer_head * buf)
{
	int dispose;

	if(buf->b_dev == B_FREE) {
		printk("Attempt to refile free buffer\n");
		return;
	}
	if (buffer_dirty(buf))
		dispose = BUF_DIRTY;
	else if (buffer_locked(buf))
		dispose = BUF_LOCKED;
	else
		dispose = BUF_CLEAN;
	if(dispose != buf->b_list) {
		file_buffer(buf, dispose);
		if(dispose == BUF_DIRTY) {
			int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);

			/* This buffer is dirty, maybe we need to start flushing.
			 * If too high a percentage of the buffers are dirty...
			 */
			if (nr_buffers_type[BUF_DIRTY] > too_many)
				wakeup_bdflush(1);

			/* If this is a loop device, and
			 * more than half of the buffers are dirty...
			 * (Prevents no-free-buffers deadlock with loop device.)
			 */
			if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
			    nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
				wakeup_bdflush(1);
		}
	}
}
/*
 * Release a buffer head
 */
void __brelse(struct buffer_head * buf)
{
	/* If dirty, mark the time this buffer should be written back. */
	set_writetime(buf, 0);
	refile_buffer(buf);
	touch_buffer(buf);

	if (buf->b_count) {
		buf->b_count--;
		return;
	}
	printk("VFS: brelse: Trying to free free buffer\n");
}

/*
 * bforget() is like brelse(), except it puts the buffer on the
 * free list if it can.. We can NOT free the buffer if:
 *  - there are other users of it
 *  - it is locked and thus can have active IO
 */
void __bforget(struct buffer_head * buf)
{
	if (buf->b_count != 1 || buffer_locked(buf)) {
		__brelse(buf);
		return;
	}
	buf->b_count = 0;
	buf->b_state = 0;
	remove_from_queues(buf);
	put_last_free(buf);
}
/*
 * bread() reads a specified block and returns the buffer that contains
 * it. It returns NULL if the block was unreadable.
 */
struct buffer_head * bread(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;

	bh = getblk(dev, block, size);
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}
/*
 * Ok, breada can be used as bread, but additionally schedules read-ahead
 * on up to NBUF further blocks of the same size; the pos/filesize
 * arguments and the device's read_ahead setting determine the size of
 * the read-ahead window.
 */

#define NBUF 16

struct buffer_head * breada(kdev_t dev, int block, int bufsize,
			    unsigned int pos, unsigned int filesize)
{
	struct buffer_head * bhlist[NBUF];
	unsigned int blocks;
	struct buffer_head * bh;
	int index;
	int i, j;

	if (pos >= filesize)
		return NULL;

	if (block < 0)
		return NULL;

	bh = getblk(dev, block, bufsize);
	index = BUFSIZE_INDEX(bh->b_size);

	if (buffer_uptodate(bh))
		return(bh);
	else ll_rw_block(READ, 1, &bh);

	blocks = (filesize - pos) >> (9+index);

	if (blocks < (read_ahead[MAJOR(dev)] >> index))
		blocks = read_ahead[MAJOR(dev)] >> index;
	if (blocks > NBUF)
		blocks = NBUF;

/*	if (blocks) printk("breada (new) %d blocks\n",blocks); */

	bhlist[0] = bh;
	j = 1;
	for(i=1; i<blocks; i++) {
		bh = getblk(dev,block+i,bufsize);
		if (buffer_uptodate(bh)) {
			brelse(bh);
			break;
		}
		else bhlist[j++] = bh;
	}

	/* Request the read for these buffers, and then release them. */
	if (j>1)
		ll_rw_block(READA, (j-1), bhlist+1);
	for(i=1; i<j; i++)
		brelse(bhlist[i]);

	/* Wait for this buffer, and then continue on. */
	bh = bhlist[0];
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}
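/*
 * The window computed above is max((filesize - pos) in blocks,
 * read_ahead[MAJOR(dev)] converted from sectors to blocks), clamped to
 * NBUF.  For example, with 1k buffers (index 1) and a 16-sector
 * read_ahead the window is at least 8 and at most 16 blocks; the loop
 * also stops early at the first block that is already uptodate.
 */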
/*
 * Note: the caller should wake up the buffer_wait list if needed.
 */
static void put_unused_buffer_head(struct buffer_head * bh)
{
	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
		nr_buffer_heads--;
		kmem_cache_free(bh_cachep, bh);
		return;
	}

	memset(bh,0,sizeof(*bh));
	init_waitqueue_head(&bh->b_wait);
	nr_unused_buffer_heads++;
	bh->b_next_free = unused_list;
	unused_list = bh;
}

/*
 * We can't put completed temporary IO buffer_heads directly onto the
 * unused_list when they become unlocked, since the device driver
 * end_request routines still expect access to the buffer_head's
 * fields after the final unlock.  So, the device driver puts them on
 * the reuse_list instead once IO completes, and we recover these to
 * the unused_list here.
 *
 * Note that we don't do a wakeup here, but return a flag indicating
 * whether we got any buffer heads. A task ready to sleep can check
 * the returned value, and any tasks already sleeping will have been
 * awakened when the buffer heads were added to the reuse list.
 */
static inline int recover_reusable_buffer_heads(void)
{
	struct buffer_head *head = xchg(&reuse_list, NULL);
	int found = 0;

	if (head) {
		do {
			struct buffer_head *bh = head;
			head = head->b_next_free;
			put_unused_buffer_head(bh);
		} while (head);
		found = 1;
	}
	return found;
}
/*
 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
 * no-buffer-head deadlock.  Return NULL on failure; waiting for
 * buffer heads is now handled in create_buffers().
 */
static struct buffer_head * get_unused_buffer_head(int async)
{
	struct buffer_head * bh;

	recover_reusable_buffer_heads();
	if (nr_unused_buffer_heads > NR_RESERVED) {
		bh = unused_list;
		unused_list = bh->b_next_free;
		nr_unused_buffer_heads--;
		return bh;
	}

	/* This is critical.  We can't swap out pages to get
	 * more buffer heads, because the swap-out may need
	 * more buffer-heads itself.  Thus SLAB_BUFFER.
	 */
	if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		init_waitqueue_head(&bh->b_wait);
		nr_buffer_heads++;
		return bh;
	}

	/*
	 * If we need an async buffer, use the reserved buffer heads.
	 */
	if (async && unused_list) {
		bh = unused_list;
		unused_list = bh->b_next_free;
		nr_unused_buffer_heads--;
		return bh;
	}

#if 0
	/*
	 * (Pending further analysis ...)
	 * Ordinary (non-async) requests can use a different memory priority
	 * to free up pages. Any swapping thus generated will use async
	 * buffer heads.
	 */
	if(!async &&
	   (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		init_waitqueue_head(&bh->b_wait);
		nr_buffer_heads++;
		return bh;
	}
#endif

	return NULL;
}
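/*
 * To summarise the policy above: the shared unused_list is used only
 * while it holds more than NR_RESERVED entries; after that we allocate
 * from the SLAB cache with SLAB_BUFFER (so the allocation itself cannot
 * recurse into swap-out), and only async page-I/O requests may dip into
 * the reserved heads as a last resort.
 */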
/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 * The async flag is used to differentiate async IO (paging, swapping)
 * from ordinary buffer allocations, and only async requests are allowed
 * to sleep waiting for buffer heads.
 */
static struct buffer_head * create_buffers(unsigned long page,
					   unsigned long size, int async)
{
	DECLARE_WAITQUEUE(wait, current);
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = get_unused_buffer_head(async);
		if (!bh)
			goto no_grow;

		bh->b_dev = B_FREE;  /* Flag as unused */
		bh->b_this_page = head;
		head = bh;

		bh->b_state = 0;
		bh->b_next_free = NULL;
		bh->b_count = 0;
		bh->b_size = size;

		bh->b_data = (char *) (page+offset);
		bh->b_list = 0;
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		do {
			bh = head;
			head = head->b_this_page;
			put_unused_buffer_head(bh);
		} while (head);

		/* Wake up any waiters ... */
		wake_up(&buffer_wait);
	}

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */
	if (!async)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	run_task_queue(&tq_disk);

	/*
	 * Set our state for sleeping, then check again for buffer heads.
	 * This ensures we won't miss a wake_up from an interrupt.
	 */
	add_wait_queue(&buffer_wait, &wait);
	current->state = TASK_UNINTERRUPTIBLE;
	if (!recover_reusable_buffer_heads())
		schedule();
	remove_wait_queue(&buffer_wait, &wait);
	current->state = TASK_RUNNING;
	goto try_again;
}
/* Run the hooks that have to be done when a page I/O has completed. */
static inline void after_unlock_page (struct page * page)
{
	if (test_and_clear_bit(PG_decr_after, &page->flags)) {
		atomic_dec(&nr_async_pages);
#ifdef DEBUG_SWAP
		printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
			(char *) page_address(page),
			atomic_read(&nr_async_pages));
#endif
	}
	if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
		swap_after_unlock_page(page->offset);
	if (test_and_clear_bit(PG_free_after, &page->flags))
		__free_page(page);
}

/*
 * Free all temporary buffers belonging to a page.
 * This needs to be called with interrupts disabled.
 */
static inline void free_async_buffers (struct buffer_head * bh)
{
	struct buffer_head *tmp, *tail;

	/*
	 * Link all the buffers into the b_next_free list,
	 * so we only have to do one xchg() operation ...
	 */
	tail = bh;
	while ((tmp = tail->b_this_page) != bh) {
		tail->b_next_free = tmp;
		tail = tmp;
	}

	/* Update the reuse list */
	tail->b_next_free = xchg(&reuse_list, NULL);
	reuse_list = bh;

	/* Wake up any waiters ... */
	wake_up(&buffer_wait);
}
static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *tmp;
	struct page *page;

	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);

	/* This is a temporary buffer used for page I/O. */
	page = mem_map + MAP_NR(bh->b_data);
	if (!PageLocked(page))
		goto not_locked;
	if (bh->b_count != 1)
		goto bad_count;

	if (!test_bit(BH_Uptodate, &bh->b_state))
		set_bit(PG_error, &page->flags);

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 *
	 * Async buffer_heads are here only as labels for IO, and get
	 * thrown away once the IO for this page is complete.  IO is
	 * deemed complete once all buffers have been visited
	 * (b_count==0) and are now unlocked. We must make sure that
	 * only the _last_ buffer that decrements its count is the one
	 * that free's the page..
	 */
	save_flags(flags);
	cli();
	bh->b_count--;
	tmp = bh;
	do {
		if (tmp->b_count)
			goto still_busy;
		tmp = tmp->b_this_page;
	} while (tmp != bh);

	/* OK, the async IO on this page is complete. */
	free_async_buffers(bh);
	restore_flags(flags);
	clear_bit(PG_locked, &page->flags);
	wake_up(&page->wait);
	after_unlock_page(page);
	return;

still_busy:
	restore_flags(flags);
	return;

not_locked:
	printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
	return;

bad_count:
	printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
	return;
}
/*
 * Start I/O on a page.
 * This function expects the page to be locked and may return before I/O is complete.
 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
 */
int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
{
	struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
	int block, nr;

	if (!PageLocked(page))
		panic("brw_page: page not locked for I/O");
	clear_bit(PG_uptodate, &page->flags);
	clear_bit(PG_error, &page->flags);
	/*
	 * Allocate async buffer heads pointing to this page, just for I/O.
	 * They do _not_ show up in the buffer hash table!
	 * They are _not_ registered in page->buffers either!
	 */
	bh = create_buffers(page_address(page), size, 1);
	if (!bh) {
		/* WSH: exit here leaves page->count incremented */
		clear_bit(PG_locked, &page->flags);
		wake_up(&page->wait);
		return -ENOMEM;
	}
	nr = 0;
	next = bh;
	do {
		struct buffer_head * tmp;
		block = *(b++);

		init_buffer(next, dev, block, end_buffer_io_async, NULL);
		set_bit(BH_Uptodate, &next->b_state);

		/*
		 * When we use bmap, we define block zero to represent
		 * a hole.  ll_rw_page, however, may legitimately
		 * access block zero, and we need to distinguish the
		 * two cases.
		 */
		if (bmap && !block) {
			memset(next->b_data, 0, size);
			next->b_count--;
			continue;
		}
		tmp = get_hash_table(dev, block, size);
		if (tmp) {
			if (!buffer_uptodate(tmp)) {
				if (rw == READ)
					ll_rw_block(READ, 1, &tmp);
				wait_on_buffer(tmp);
			}
			if (rw == READ)
				memcpy(next->b_data, tmp->b_data, size);
			else {
				memcpy(tmp->b_data, next->b_data, size);
				mark_buffer_dirty(tmp, 0);
			}
			brelse(tmp);
			next->b_count--;
			continue;
		}
		if (rw == READ)
			clear_bit(BH_Uptodate, &next->b_state);
		else
			set_bit(BH_Dirty, &next->b_state);
		arr[nr++] = next;
	} while (prev = next, (next = next->b_this_page) != NULL);
	prev->b_this_page = bh;

	if (nr) {
		ll_rw_block(rw, nr, arr);
		/* The rest of the work is done in mark_buffer_uptodate()
		 * and unlock_buffer(). */
	} else {
		unsigned long flags;
		clear_bit(PG_locked, &page->flags);
		set_bit(PG_uptodate, &page->flags);
		wake_up(&page->wait);
		save_flags(flags);
		cli();
		free_async_buffers(bh);
		restore_flags(flags);
		after_unlock_page(page);
	}
	++current->maj_flt;
	return 0;
}
/*
 * This is called by end_request() when I/O has completed.
 */
void mark_buffer_uptodate(struct buffer_head * bh, int on)
{
	if (on) {
		struct buffer_head *tmp = bh;
		set_bit(BH_Uptodate, &bh->b_state);
		/* If a page has buffers and all these buffers are uptodate,
		 * then the page is uptodate. */
		do {
			if (!test_bit(BH_Uptodate, &tmp->b_state))
				return;
			tmp=tmp->b_this_page;
		} while (tmp && tmp != bh);
		set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
		return;
	}
	clear_bit(BH_Uptodate, &bh->b_state);
}
/*
 * Generic "readpage" function for block devices that have the normal
 * bmap functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * mark_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
int generic_readpage(struct file * file, struct page * page)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long block;
	int *p, nr[PAGE_SIZE/512];
	int i;

	atomic_inc(&page->count);
	set_bit(PG_locked, &page->flags);
	set_bit(PG_free_after, &page->flags);

	i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
	block = page->offset >> inode->i_sb->s_blocksize_bits;
	p = nr;
	do {
		*p = inode->i_op->bmap(inode, block);
		i--;
		block++;
		p++;
	} while (i > 0);

	/* IO start */
	brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
	return 0;
}
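/*
 * For example, with a 4 KB page and a 1 KB filesystem block size the
 * loop above makes four bmap() calls and fills nr[0..3] with on-disk
 * block numbers; brw_page() then treats a zero entry as a hole and
 * supplies zeroed data for it.
 */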
/*
 * Try to increase the number of buffers available: the size argument
 * is used to determine what kind of buffers we want.
 */
static int grow_buffers(int size)
{
	unsigned long page;
	struct buffer_head *bh, *tmp;
	struct buffer_head * insert_point;
	int isize;

	if ((size & 511) || (size > PAGE_SIZE)) {
		printk("VFS: grow_buffers: size = %d\n",size);
		return 0;
	}

	if (!(page = __get_free_page(GFP_BUFFER)))
		return 0;
	bh = create_buffers(page, size, 0);
	if (!bh) {
		free_page(page);
		return 0;
	}

	isize = BUFSIZE_INDEX(size);
	insert_point = free_list[isize];

	tmp = bh;
	while (1) {
		if (insert_point) {
			tmp->b_next_free = insert_point->b_next_free;
			tmp->b_prev_free = insert_point;
			insert_point->b_next_free->b_prev_free = tmp;
			insert_point->b_next_free = tmp;
		} else {
			tmp->b_prev_free = tmp;
			tmp->b_next_free = tmp;
		}
		insert_point = tmp;
		++nr_buffers;
		if (tmp->b_this_page)
			tmp = tmp->b_this_page;
		else
			break;
	}
	tmp->b_this_page = bh;
	free_list[isize] = bh;
	mem_map[MAP_NR(page)].buffers = bh;
	buffermem += PAGE_SIZE;
	return 1;
}
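/*
 * Each page obtained above is carved into PAGE_SIZE/size buffers, linked
 * circularly through b_this_page, inserted into free_list[isize] and
 * recorded in mem_map[].buffers, so try_to_free_buffers() can hand the
 * whole page back once every buffer on it is idle.
 */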
/*
 * Can the buffer be thrown out?
 */
#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh)		((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and free's the page if so.
 *
 * Wake up bdflush() if this fails - if we're running low on memory due
 * to dirty buffers, we need to flush them out as quickly as possible.
 */
int try_to_free_buffers(struct page * page_map)
{
	struct buffer_head * tmp, * bh = page_map->buffers;

	tmp = bh;
	do {
		struct buffer_head * p = tmp;

		tmp = tmp->b_this_page;
		if (!buffer_busy(p))
			continue;

		wakeup_bdflush(0);
		return 0;
	} while (tmp != bh);

	tmp = bh;
	do {
		struct buffer_head * p = tmp;
		tmp = tmp->b_this_page;
		nr_buffers--;
		remove_from_queues(p);
		put_unused_buffer_head(p);
	} while (tmp != bh);

	/* Wake up anyone waiting for buffer heads */
	wake_up(&buffer_wait);

	/* And free the page */
	buffermem -= PAGE_SIZE;
	page_map->buffers = NULL;
	__free_page(page_map);
	return 1;
}
/* ================== Debugging =================== */

void show_buffers(void)
{
	struct buffer_head * bh;
	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
	int protected = 0;
	int nlist;
	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};

	printk("Buffer memory:   %6dkB\n",buffermem>>10);
	printk("Buffer heads:    %6d\n",nr_buffer_heads);
	printk("Buffer blocks:   %6d\n",nr_buffers);
	printk("Buffer hashed:   %6d\n",nr_hashed_buffers);

	for(nlist = 0; nlist < NR_LIST; nlist++) {
		found = locked = dirty = used = lastused = protected = 0;
		bh = lru_list[nlist];
		if(!bh) continue;

		do {
			found++;
			if (buffer_locked(bh))
				locked++;
			if (buffer_protected(bh))
				protected++;
			if (buffer_dirty(bh))
				dirty++;
			if (bh->b_count)
				used++, lastused = found;
			bh = bh->b_next_free;
		} while (bh != lru_list[nlist]);
		printk("%8s: %d buffers, %d used (last=%d), "
		       "%d locked, %d protected, %d dirty\n",
		       buf_types[nlist], found, used, lastused,
		       locked, protected, dirty);
	}
}
/* ===================== Init ======================= */

/*
 * allocate the hash table and init the free list
 * Use gfp() for the hash table to decrease TLB misses, use
 * SLAB cache for buffer heads.
 */
void __init buffer_init(unsigned long memory_size)
{
	int order;
	unsigned int nr_hash;

	/* we need to guess at the right sort of size for a buffer cache.
	   the heuristic from working with large databases and getting
	   fsync times (ext2) manageable, is the following */

	memory_size >>= 20;
	for (order = 5; (1UL << order) < memory_size; order++);

	/* try to allocate something until we get it or we're asking
	   for something that is really too small */

	do {
		nr_hash = (1UL << order) * PAGE_SIZE /
		          sizeof(struct buffer_head *);
		hash_table = (struct buffer_head **)
		             __get_free_pages(GFP_ATOMIC, order);
	} while (hash_table == NULL && --order > 4);

	if (!hash_table)
		panic("Failed to allocate buffer hash table\n");
	memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
	bh_hash_mask = nr_hash-1;
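	/*
	 * As a worked example (assuming 4 KB pages and 4-byte pointers):
	 * with 64 MB of memory the loop above settles on order 6, i.e.
	 * 64 pages, giving 64 * 4096 / 4 = 65536 hash buckets and a
	 * bh_hash_mask of 0xffff.
	 */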
	bh_cachep = kmem_cache_create("buffer_head",
				      sizeof(struct buffer_head),
				      0,
				      SLAB_HWCACHE_ALIGN, NULL, NULL);
	if(!bh_cachep)
		panic("Cannot create buffer head SLAB cache\n");
	/*
	 * Allocate the reserved buffer heads.
	 */
	while (nr_buffer_heads < NR_RESERVED) {
		struct buffer_head * bh;

		bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
		if (!bh)
			break;
		put_unused_buffer_head(bh);
		nr_buffer_heads++;
	}

	lru_list[BUF_CLEAN] = 0;
	grow_buffers(BLOCK_SIZE);
}
/* ====================== bdflush support =================== */

/* This is a simple kernel daemon, whose job it is to provide a dynamic
 * response to dirty buffers.  Once this process is activated, we write back
 * a limited number of buffers to the disks and then go back to sleep again.
 */
static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
struct task_struct *bdflush_tsk = 0;

void wakeup_bdflush(int wait)
{
	if (current == bdflush_tsk)
		return;
	wake_up_process(bdflush_tsk);
	if (wait) {
		run_task_queue(&tq_disk);
		sleep_on(&bdflush_done);
	}
}
/*
 * Here we attempt to write back old buffers.
 * To prevent deadlocks for a loop device:
 * 1) Do non-blocking writes to loop (avoids deadlock with running
 *	out of request blocks).
 * 2) But do a blocking write if the only dirty buffers are loop buffers
 *	(otherwise we go into an infinite busy-loop).
 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
 *	with running out of free buffers for loop's "real" device).
 */
static inline void sync_old_buffers(void)
{
	int i;
	int ndirty = 0;
	int wrta_cmd = WRITEA;
#ifdef DEBUG
	int ncount = 0, nwritten = 0;
#endif
	struct buffer_head * bh, *next;

#ifdef DEBUG
	bh = lru_list[BUF_CLEAN];
	if(bh)
		for(i = nr_buffers_type[BUF_CLEAN]; --i > 0; bh = next) {
			next = bh->b_next_free;

			/* Dirty/locked buffer on clean list?  Refile it */
			if (buffer_locked(bh) || buffer_dirty(bh)) {
				ncount++;
				refile_buffer(bh);
			}
		}
#endif

	bh = lru_list[BUF_LOCKED];
	if(bh)
		for(i = nr_buffers_type[BUF_LOCKED]; --i > 0; bh = next) {
			next = bh->b_next_free;

			/* Unlocked buffer on locked list?  Refile it */
			if (!buffer_locked(bh))
				refile_buffer(bh);
		}

restart:
	bh = lru_list[BUF_DIRTY];
	if(bh)
		for (i = nr_buffers_type[BUF_DIRTY];
		     i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
		     bh = next) {
			/* We may have stalled while waiting for
			   I/O to complete. */
			if(bh->b_list != BUF_DIRTY)
				goto restart;
			next = bh->b_next_free;
			if(!lru_list[BUF_DIRTY]) {
				printk("Dirty list empty %d\n", i);
				break;
			}

			/* Clean buffer on dirty list?  Refile it */
			if (!buffer_dirty(bh)) {
				refile_buffer(bh);
				continue;
			}

			if (buffer_locked(bh))
				continue;
			/* Should we write back buffers that are
			   shared or not?? Currently dirty buffers
			   are not shared, so it does not matter */
			next->b_count++;
			bh->b_count++;
			ndirty++;
			bh->b_flushtime = 0;
			if (MAJOR(bh->b_dev) == LOOP_MAJOR) {
				ll_rw_block(wrta_cmd,1, &bh);
				wrta_cmd = WRITEA;
				if (buffer_dirty(bh))
					--ndirty;
			}
			else
				ll_rw_block(WRITE, 1, &bh);
			bh->b_count--;
			next->b_count--;
		}

	/* If we didn't write anything, but there are still
	 * dirty buffers, then make the next write to a
	 * loop device to be a blocking write.
	 * This lets us block--which we _must_ do! */
	if (ndirty == 0
	    && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
		wrta_cmd = WRITE;
		goto restart;
	}

#ifdef DEBUG
	if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
	printk("wrote %d/%d buffers...", nwritten, ndirty);
#endif
	run_task_queue(&tq_disk);
}
/* This is the interface to bdflush.  As we get more sophisticated, we can
 * pass tuning parameters to this "process", to adjust how it behaves.
 * We would want to verify each parameter, however, to make sure that it
 * is reasonable. */

asmlinkage int sys_bdflush(int func, long data)
{
	int i, error = -EPERM;

	lock_kernel();
	if (!capable(CAP_SYS_ADMIN))
		goto out;

	if (func == 1)
		/* Func 1 used to call sync_old_buffers; a user space
		   daemon would call it periodically.  This is no
		   longer necessary.  Returning -EPERM here makes the
		   daemon silently exit. */
		goto out;

	/* Basically func 2 reads param 1, func 3 writes param 1,
	   func 4 reads param 2, func 5 writes param 2, and so on. */
	if (func >= 2) {
		i = (func-2) >> 1;
		error = -EINVAL;
		if (i < 0 || i >= N_PARAM)
			goto out;
		if((func & 1) == 0) {
			error = put_user(bdf_prm.data[i], (int*)data);
			goto out;
		}
		if (data < bdflush_min[i] || data > bdflush_max[i])
			goto out;
		bdf_prm.data[i] = data;
		error = 0;
		goto out;
	}

	/* Func 0 used to launch the actual bdflush and then never
	 * return (unless explicitly killed). We return zero here to
	 * remain semi-compatible with present update(8) programs.
	 */
	error = 0;
out:
	unlock_kernel();
	return error;
}
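/*
 * For example, bdflush(2, (long) &val) copies bdf_prm.data[0] (nfract)
 * to val, bdflush(3, 60) sets it to 60 (subject to bdflush_min/max),
 * and funcs 4 and 5 read and write data[1] (ndirty), and so on.
 */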
/* This is the actual bdflush daemon itself. It used to be started
 * from the syscall above, but now we launch it ourselves internally
 * with kernel_thread(...) directly after the first thread in
 * init/main.c. Every so often, or when woken up by another task that
 * needs memory, we call sync_old_buffers to partially clear the dirty list.
 */
int bdflush(void * unused)
{
	long remaining = HZ * bdf_prm.b_un.interval;
	struct task_struct *tsk = current;

	/*
	 *	We have a bare-bones task_struct, and really should fill
	 *	in a few more things so "top" and /proc/2/{exe,root,cwd}
	 *	display semi-sane things. Not real crucial though...
	 */

	tsk->session = 1;
	tsk->pgrp = 1;
	tsk->dumpable = 0; /* inhibit ptrace() */
	strcpy(tsk->comm, "kflushd");
	sigfillset(&tsk->blocked);
	bdflush_tsk = tsk;

	/*
	 *	As a kernel thread we want to tamper with system buffers
	 *	and other internals and thus be subject to the SMP locking
	 *	rules. (On a uniprocessor box this does nothing).
	 */
	lock_kernel();

	for (;;) {
		tsk->state = TASK_INTERRUPTIBLE;
		remaining = schedule_timeout(remaining);

#ifdef DEBUG
		printk("bdflush() activated...");
#endif
		CHECK_EMERGENCY_SYNC

		if (remaining == 0) {
			/*
			 * Also try to flush inodes and supers, since
			 * otherwise there would be no way of ensuring
			 * that these quantities ever get written
			 * back.  Ideally, we would have a timestamp
			 * on the inodes and superblocks so that we
			 * could write back only the old ones.
			 */
			sync_supers(0);
			sync_inodes(0);
			remaining = HZ * bdf_prm.b_un.interval;
		}

		/* Keep flushing till there aren't very many dirty buffers */
		do {
			sync_old_buffers();
		} while(nr_buffers_type[BUF_DIRTY] > nr_buffers * bdf_prm.b_un.nfract/100);

		wake_up(&bdflush_done);
#ifdef DEBUG
		printk("sleeping again.\n");
#endif
	}
}