fs/buffer.c (Linux 2.2.8, davej-history.git)
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */

/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */

/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */

/* Added 32k buffer block sizes - these are required for older ARM systems.
 * - RMK
 */
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/sysrq.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/quotaops.h>

#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/bitops.h>

#define NR_SIZES 7
static char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  6};

#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
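/*
 * Worked example: BUFSIZE_INDEX() shifts the block size right by 9 and
 * indexes buffersize_index[], so
 *	BUFSIZE_INDEX(512)   == 0
 *	BUFSIZE_INDEX(1024)  == 1
 *	BUFSIZE_INDEX(4096)  == 3
 *	BUFSIZE_INDEX(32768) == 6	(the 32k ARM case mentioned above)
 * Any size that is not a supported power of two lands on a -1 entry.
 */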
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
					     number of unused buffer heads */

/*
 * Hash table mask..
 */
static unsigned long bh_hash_mask = 0;

static int grow_buffers(int size);

static struct buffer_head ** hash_table;
static struct buffer_head * lru_list[NR_LIST] = {NULL, };
static struct buffer_head * free_list[NR_SIZES] = {NULL, };

static kmem_cache_t *bh_cachep;

static struct buffer_head * unused_list = NULL;
static struct buffer_head * reuse_list = NULL;
static struct wait_queue * buffer_wait = NULL;

static int nr_buffers = 0;
static int nr_buffers_type[NR_LIST] = {0,};
static int nr_buffer_heads = 0;
static int nr_unused_buffer_heads = 0;
static int nr_hashed_buffers = 0;

/* This is used by some architectures to estimate available memory. */
int buffermem = 0;

/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
union bdflush_param{
	struct {
		int nfract;	/* Percentage of buffer cache dirty to
				   activate bdflush */
		int ndirty;	/* Maximum number of dirty blocks to write out per
				   wake-cycle */
		int nrefill;	/* Number of clean buffers to try to obtain
				   each time we call refill */
		int nref_dirt;	/* Dirty buffer threshold for activating bdflush
				   when trying to refill buffers. */
		int interval;	/* Interval (seconds) between spontaneous
				   bdflush runs */
		int age_buffer;	/* Time for normal buffer to age before
				   we flush it */
		int age_super;	/* Time for superblock to age before we
				   flush it */
		int dummy2;	/* unused */
		int dummy3;	/* unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{40, 500, 64, 256, 5, 30*HZ, 5*HZ, 1884, 2}};

/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  1,   1*HZ,   1*HZ, 1, 1};
int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 600*HZ, 600*HZ, 2047, 5};
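/*
 * With the defaults above: bdflush is woken once more than nfract=40% of
 * all buffers are dirty, writes at most ndirty=500 blocks per wake-cycle,
 * runs spontaneously every interval=5 seconds, and lets a normal buffer
 * age for age_buffer=30*HZ jiffies (30 seconds; age_super gives 5 seconds
 * for superblock buffers) before it is flushed.
 */
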
void wakeup_bdflush(int);

/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and getting rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * whether the buffer is locked before calling this, so that the queues
 * aren't set up unnecessarily.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	struct task_struct *tsk = current;
	struct wait_queue wait;

	bh->b_count++;
	wait.task = tsk;
	add_wait_queue(&bh->b_wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (buffer_locked(bh)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
	bh->b_count--;
}
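/*
 * The wait_on_buffer() wrapper referred to above lives in <linux/locks.h>;
 * roughly, it only drops into __wait_on_buffer() when the buffer really is
 * locked, along these lines (sketch, not part of this file):
 *
 *	extern inline void wait_on_buffer(struct buffer_head * bh)
 *	{
 *		if (test_bit(BH_Lock, &bh->b_state))
 *			__wait_on_buffer(bh);
 *	}
 */
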
/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed.  Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
static int sync_buffers(kdev_t dev, int wait)
{
	int i, retry, pass = 0, err = 0;
	struct buffer_head * bh, *next;

	/* One pass for no-wait, three for wait:
	 * 0) write out all dirty, unlocked buffers;
	 * 1) write out all dirty buffers, waiting if locked;
	 * 2) wait for completion by waiting for all buffers to unlock.
	 */
	do {
		retry = 0;
	repeat:
		/* We search all lists as a failsafe mechanism, not because we expect
		 * there to be dirty buffers on any of the other lists.
		 */
		bh = lru_list[BUF_DIRTY];
		if (!bh)
			goto repeat2;
		for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
			if (bh->b_list != BUF_DIRTY)
				goto repeat;
			next = bh->b_next_free;
			if (!lru_list[BUF_DIRTY])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				wait_on_buffer (bh);
				goto repeat;
			}

			/* If an unlocked buffer is not uptodate, there has
			 * been an IO error. Skip it.
			 */
			if (wait && buffer_req(bh) && !buffer_locked(bh) &&
			    !buffer_dirty(bh) && !buffer_uptodate(bh)) {
				err = -EIO;
				continue;
			}

			/* Don't write clean buffers.  Don't write ANY buffers
			 * on the third pass.
			 */
			if (!buffer_dirty(bh) || pass >= 2)
				continue;

			/* Don't bother about locked buffers.
			 *
			 * XXX We checked if it was locked above and there is no
			 * XXX way we could have slept in between. -DaveM
			 */
			if (buffer_locked(bh))
				continue;
			bh->b_count++;
			next->b_count++;
			bh->b_flushtime = 0;
			ll_rw_block(WRITE, 1, &bh);
			bh->b_count--;
			next->b_count--;
			retry = 1;
		}

	repeat2:
		bh = lru_list[BUF_LOCKED];
		if (!bh)
			break;
		for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
			if (bh->b_list != BUF_LOCKED)
				goto repeat2;
			next = bh->b_next_free;
			if (!lru_list[BUF_LOCKED])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				wait_on_buffer (bh);
				goto repeat2;
			}
		}

	/* If we are waiting for the sync to succeed, and if any dirty
	 * blocks were written, then repeat; on the second pass, only
	 * wait for buffers being written (do not pass to write any
	 * more buffers on the second pass).
	 */
	} while (wait && retry && ++pass<=2);
	return err;
}

void sync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	sync_buffers(dev, 0);
	DQUOT_SYNC(dev);
	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */
}

int fsync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	DQUOT_SYNC(dev);
	return sync_buffers(dev, 1);
}

asmlinkage int sys_sync(void)
{
	lock_kernel();
	fsync_dev(0);
	unlock_kernel();
	return 0;
}

/*
 * filp may be NULL if called via the msync of a vma.
 */
int file_fsync(struct file *filp, struct dentry *dentry)
{
	struct inode * inode = dentry->d_inode;
	struct super_block * sb;
	kdev_t dev;

	/* sync the inode to buffers */
	write_inode_now(inode);

	/* sync the superblock to buffers */
	sb = inode->i_sb;
	wait_on_super(sb);
	if (sb->s_op && sb->s_op->write_super)
		sb->s_op->write_super(sb);

	/* .. finally sync the buffers to disk */
	dev = inode->i_dev;
	return sync_buffers(dev, 1);
}

asmlinkage int sys_fsync(unsigned int fd)
{
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;
	int err;

	lock_kernel();
	err = -EBADF;
	file = fget(fd);
	if (!file)
		goto out;

	dentry = file->f_dentry;
	if (!dentry)
		goto out_putf;

	inode = dentry->d_inode;
	if (!inode)
		goto out_putf;

	err = -EINVAL;
	if (!file->f_op || !file->f_op->fsync)
		goto out_putf;

	/* We need to protect against concurrent writers.. */
	down(&inode->i_sem);
	err = file->f_op->fsync(file, dentry);
	up(&inode->i_sem);

out_putf:
	fput(file);
out:
	unlock_kernel();
	return err;
}

asmlinkage int sys_fdatasync(unsigned int fd)
{
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;
	int err;

	lock_kernel();
	err = -EBADF;
	file = fget(fd);
	if (!file)
		goto out;

	dentry = file->f_dentry;
	if (!dentry)
		goto out_putf;

	inode = dentry->d_inode;
	if (!inode)
		goto out_putf;

	err = -EINVAL;
	if (!file->f_op || !file->f_op->fsync)
		goto out_putf;

	/* this needs further work, at the moment it is identical to fsync() */
	down(&inode->i_sem);
	err = file->f_op->fsync(file, dentry);
	up(&inode->i_sem);

out_putf:
	fput(file);
out:
	unlock_kernel();
	return err;
}

void invalidate_buffers(kdev_t dev)
{
	int i;
	int nlist;
	struct buffer_head * bh;

	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
			if (bh->b_dev != dev)
				continue;
			wait_on_buffer(bh);
			if (bh->b_dev != dev)
				continue;
			if (bh->b_count)
				continue;
			bh->b_flushtime = 0;
			clear_bit(BH_Protected, &bh->b_state);
			clear_bit(BH_Uptodate, &bh->b_state);
			clear_bit(BH_Dirty, &bh->b_state);
			clear_bit(BH_Req, &bh->b_state);
		}
	}
}

#define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
#define hash(dev,block) hash_table[_hashfn(dev,block)]
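/*
 * Example: with a 512-entry table (bh_hash_mask == 0x1ff), block 1234 of
 * device 0x0301 hashes to bucket (HASHDEV(0x0301) ^ 1234) & 0x1ff; the
 * chain rooted at hash_table[that bucket] is then walked by find_buffer()
 * below.
 */
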
static inline void remove_from_hash_queue(struct buffer_head * bh)
{
	struct buffer_head **pprev = bh->b_pprev;
	if (pprev) {
		struct buffer_head * next = bh->b_next;
		if (next) {
			next->b_pprev = pprev;
			bh->b_next = NULL;
		}
		*pprev = next;
		bh->b_pprev = NULL;
	}
	nr_hashed_buffers--;
}

static inline void remove_from_lru_list(struct buffer_head * bh)
{
	if (!(bh->b_prev_free) || !(bh->b_next_free))
		panic("VFS: LRU block list corrupted");
	if (bh->b_dev == B_FREE)
		panic("LRU list corrupted");
	bh->b_prev_free->b_next_free = bh->b_next_free;
	bh->b_next_free->b_prev_free = bh->b_prev_free;

	if (lru_list[bh->b_list] == bh)
		lru_list[bh->b_list] = bh->b_next_free;
	if (lru_list[bh->b_list] == bh)
		lru_list[bh->b_list] = NULL;
	bh->b_next_free = bh->b_prev_free = NULL;
}

static inline void remove_from_free_list(struct buffer_head * bh)
{
	int isize = BUFSIZE_INDEX(bh->b_size);
	if (!(bh->b_prev_free) || !(bh->b_next_free))
		panic("VFS: Free block list corrupted");
	if(bh->b_dev != B_FREE)
		panic("Free list corrupted");
	if(!free_list[isize])
		panic("Free list empty");
	if(bh->b_next_free == bh)
		free_list[isize] = NULL;
	else {
		bh->b_prev_free->b_next_free = bh->b_next_free;
		bh->b_next_free->b_prev_free = bh->b_prev_free;
		if (free_list[isize] == bh)
			free_list[isize] = bh->b_next_free;
	}
	bh->b_next_free = bh->b_prev_free = NULL;
}

static void remove_from_queues(struct buffer_head * bh)
{
	if(bh->b_dev == B_FREE) {
		remove_from_free_list(bh); /* Free list entries should not be
					      in the hash queue */
		return;
	}
	nr_buffers_type[bh->b_list]--;
	remove_from_hash_queue(bh);
	remove_from_lru_list(bh);
}

static inline void put_last_free(struct buffer_head * bh)
{
	if (bh) {
		struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];

		bh->b_dev = B_FREE;  /* So it is obvious we are on the free list. */

		/* Add to back of free list. */
		if(!*bhp) {
			*bhp = bh;
			bh->b_prev_free = bh;
		}

		bh->b_next_free = *bhp;
		bh->b_prev_free = (*bhp)->b_prev_free;
		(*bhp)->b_prev_free->b_next_free = bh;
		(*bhp)->b_prev_free = bh;
	}
}

static void insert_into_queues(struct buffer_head * bh)
{
	/* put at end of free list */
	if(bh->b_dev == B_FREE) {
		put_last_free(bh);
	} else {
		struct buffer_head **bhp = &lru_list[bh->b_list];

		if(!*bhp) {
			*bhp = bh;
			bh->b_prev_free = bh;
		}

		if (bh->b_next_free)
			panic("VFS: buffer LRU pointers corrupted");

		bh->b_next_free = *bhp;
		bh->b_prev_free = (*bhp)->b_prev_free;
		(*bhp)->b_prev_free->b_next_free = bh;
		(*bhp)->b_prev_free = bh;

		nr_buffers_type[bh->b_list]++;

		/* Put the buffer in new hash-queue if it has a device. */
		bh->b_next = NULL;
		bh->b_pprev = NULL;
		if (bh->b_dev) {
			struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
			struct buffer_head *next = *bhp;

			if (next) {
				bh->b_next = next;
				next->b_pprev = &bh->b_next;
			}
			*bhp = bh;
			bh->b_pprev = bhp;
		}
		nr_hashed_buffers++;
	}
}

struct buffer_head * find_buffer(kdev_t dev, int block, int size)
{
	struct buffer_head * next;

	next = hash(dev,block);
	for (;;) {
		struct buffer_head *tmp = next;
		if (!next)
			break;
		next = tmp->b_next;
		if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
			continue;
		next = tmp;
		break;
	}
	return next;
}

/*
 * Why like this, I hear you say... The reason is race-conditions.
 * As we don't lock buffers (unless we are reading them, that is),
 * something might happen to it while we sleep (ie a read-error
 * will force it bad). This shouldn't really happen currently, but
 * the code is ready.
 */
struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;
	bh = find_buffer(dev,block,size);
	if (bh)
		bh->b_count++;
	return bh;
}

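/*
 * Typical use of the lookup interfaces above (a sketch, not called from
 * anywhere in this file): get_hash_table() only returns a buffer that is
 * already in the cache, with b_count raised, while getblk() below will
 * allocate one if needed.  Either way the caller must brelse() it.
 */
#if 0
static void example_peek_block(kdev_t dev, int block, int size)
{
	struct buffer_head *bh = get_hash_table(dev, block, size);

	if (bh) {
		/* buffer was cached; b_count is now elevated */
		brelse(bh);
	}
}
#endif
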
unsigned int get_hardblocksize(kdev_t dev)
{
	/*
	 * Get the hard sector size for the given device.  If we don't know
	 * what it is, return 0.
	 */
	if (hardsect_size[MAJOR(dev)] != NULL) {
		int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
		if (blksize != 0)
			return blksize;
	}

	/*
	 * We don't know what the hardware sector size for this device is.
	 * Return 0 indicating that we don't know.
	 */
	return 0;
}

void set_blocksize(kdev_t dev, int size)
{
	extern int *blksize_size[];
	int i, nlist;
	struct buffer_head * bh, *bhnext;

	if (!blksize_size[MAJOR(dev)])
		return;

	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		panic("Invalid blocksize passed to set_blocksize");

	if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
		blksize_size[MAJOR(dev)][MINOR(dev)] = size;
		return;
	}
	if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
		return;
	sync_buffers(dev, 2);
	blksize_size[MAJOR(dev)][MINOR(dev)] = size;

	/* We need to be quite careful how we do this - we are moving entries
	 * around on the free list, and we can get in a loop if we are not careful.
	 */
	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
			if(!bh)
				break;

			bhnext = bh->b_next_free;
			if (bh->b_dev != dev)
				continue;
			if (bh->b_size == size)
				continue;
			bhnext->b_count++;
			wait_on_buffer(bh);
			bhnext->b_count--;
			if (bh->b_dev == dev && bh->b_size != size) {
				clear_bit(BH_Dirty, &bh->b_state);
				clear_bit(BH_Uptodate, &bh->b_state);
				clear_bit(BH_Req, &bh->b_state);
				bh->b_flushtime = 0;
			}
			remove_from_hash_queue(bh);
		}
	}
}

/*
 * We used to try various strange things. Let's not.
 */
static void refill_freelist(int size)
{
	if (!grow_buffers(size)) {
		wakeup_bdflush(1);
		current->policy |= SCHED_YIELD;
		schedule();
	}
}

void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
		 bh_end_io_t *handler, void *dev_id)
{
	bh->b_count = 1;
	bh->b_list = BUF_CLEAN;
	bh->b_flushtime = 0;
	bh->b_dev = dev;
	bh->b_blocknr = block;
	bh->b_end_io = handler;
	bh->b_dev_id = dev_id;
}

static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}

/*
 * Ok, this is getblk, and it isn't very clear, again to hinder
 * race-conditions. Most of the code is seldom used, (ie repeating),
 * so it should be much more efficient than it looks.
 *
 * The algorithm is changed: hopefully better, and an elusive bug removed.
 *
 * 14.02.92: changed it to sync dirty buffers a bit: better performance
 * when the filesystem starts to get full of dirty blocks (I hope).
 */
struct buffer_head * getblk(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;
	int isize;

repeat:
	bh = get_hash_table(dev, block, size);
	if (bh) {
		if (!buffer_dirty(bh)) {
			bh->b_flushtime = 0;
		}
		return bh;
	}

	isize = BUFSIZE_INDEX(size);
get_free:
	bh = free_list[isize];
	if (!bh)
		goto refill;
	remove_from_free_list(bh);

	/* OK, FINALLY we know that this buffer is the only one of its kind,
	 * and that it's unused (b_count=0), unlocked, and clean.
	 */
	init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
	bh->b_state=0;
	insert_into_queues(bh);
	return bh;

	/*
	 * If we block while refilling the free list, somebody may
	 * create the buffer first ... search the hashes again.
	 */
refill:
	refill_freelist(size);
	if (!find_buffer(dev,block,size))
		goto get_free;
	goto repeat;
}

void set_writetime(struct buffer_head * buf, int flag)
{
	int newtime;

	if (buffer_dirty(buf)) {
		/* Move buffer to dirty list if jiffies is clear. */
		newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
				     bdf_prm.b_un.age_buffer);
		if(!buf->b_flushtime || buf->b_flushtime > newtime)
			buf->b_flushtime = newtime;
	} else {
		buf->b_flushtime = 0;
	}
}

/*
 * Put a buffer into the appropriate list, without side-effects.
 */
static inline void file_buffer(struct buffer_head *bh, int list)
{
	remove_from_queues(bh);
	bh->b_list = list;
	insert_into_queues(bh);
}

/*
 * A buffer may need to be moved from one buffer list to another
 * (e.g. in case it is not shared any more). Handle this.
 */
void refile_buffer(struct buffer_head * buf)
{
	int dispose;

	if(buf->b_dev == B_FREE) {
		printk("Attempt to refile free buffer\n");
		return;
	}
	if (buffer_dirty(buf))
		dispose = BUF_DIRTY;
	else if (buffer_locked(buf))
		dispose = BUF_LOCKED;
	else
		dispose = BUF_CLEAN;
	if(dispose != buf->b_list) {
		file_buffer(buf, dispose);
		if(dispose == BUF_DIRTY) {
			int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);

			/* This buffer is dirty, maybe we need to start flushing.
			 * If too high a percentage of the buffers are dirty...
			 */
			if (nr_buffers_type[BUF_DIRTY] > too_many)
				wakeup_bdflush(1);

			/* If this is a loop device, and
			 * more than half of the buffers are dirty...
			 * (Prevents no-free-buffers deadlock with loop device.)
			 */
			if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
			    nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
				wakeup_bdflush(1);
		}
	}
}

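/*
 * Example of the threshold above: with 1000 buffers in the cache and the
 * default nfract of 40, too_many == 400, so refiling the 401st buffer
 * onto the dirty list wakes bdflush; a loop device wakes it as soon as
 * more than half of all buffers are dirty.
 */
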
/*
 * Release a buffer head
 */
void __brelse(struct buffer_head * buf)
{
	/* If dirty, mark the time this buffer should be written back. */
	set_writetime(buf, 0);
	refile_buffer(buf);
	touch_buffer(buf);

	if (buf->b_count) {
		buf->b_count--;
		return;
	}
	printk("VFS: brelse: Trying to free free buffer\n");
}

/*
 * bforget() is like brelse(), except it puts the buffer on the
 * free list if it can.. We can NOT free the buffer if:
 *  - there are other users of it
 *  - it is locked and thus can have active IO
 */
void __bforget(struct buffer_head * buf)
{
	if (buf->b_count != 1 || buffer_locked(buf)) {
		__brelse(buf);
		return;
	}
	buf->b_count = 0;
	buf->b_state = 0;
	remove_from_queues(buf);
	put_last_free(buf);
}

/*
 * bread() reads a specified block and returns the buffer that contains
 * it. It returns NULL if the block was unreadable.
 */
struct buffer_head * bread(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;

	bh = getblk(dev, block, size);
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}

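/*
 * Typical bread() usage (sketch only; the function below is illustrative
 * and not part of this file): read one 1k block and release it when done.
 */
#if 0
static void example_read_block(kdev_t dev, int block)
{
	struct buffer_head *bh = bread(dev, block, BLOCK_SIZE);

	if (!bh)
		return;		/* I/O error */
	/* ... use the BLOCK_SIZE bytes at bh->b_data ... */
	brelse(bh);
}
#endif
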
/*
 * Ok, breada can be used as bread, but additionally to mark other
 * blocks for reading as well. End the argument list with a negative
 * number.
 */

#define NBUF 16

struct buffer_head * breada(kdev_t dev, int block, int bufsize,
	unsigned int pos, unsigned int filesize)
{
	struct buffer_head * bhlist[NBUF];
	unsigned int blocks;
	struct buffer_head * bh;
	int index;
	int i, j;

	if (pos >= filesize)
		return NULL;

	if (block < 0)
		return NULL;

	bh = getblk(dev, block, bufsize);
	index = BUFSIZE_INDEX(bh->b_size);

	if (buffer_uptodate(bh))
		return(bh);
	else ll_rw_block(READ, 1, &bh);

	blocks = (filesize - pos) >> (9+index);

	if (blocks < (read_ahead[MAJOR(dev)] >> index))
		blocks = read_ahead[MAJOR(dev)] >> index;
	if (blocks > NBUF)
		blocks = NBUF;

/*	if (blocks) printk("breada (new) %d blocks\n",blocks); */

	bhlist[0] = bh;
	j = 1;
	for(i=1; i<blocks; i++) {
		bh = getblk(dev,block+i,bufsize);
		if (buffer_uptodate(bh)) {
			brelse(bh);
			break;
		}
		else bhlist[j++] = bh;
	}

	/* Request the read for these buffers, and then release them. */
	if (j>1)
		ll_rw_block(READA, (j-1), bhlist+1);
	for(i=1; i<j; i++)
		brelse(bhlist[i]);

	/* Wait for this buffer, and then continue on. */
	bh = bhlist[0];
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}

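/*
 * Read-ahead arithmetic above, worked through: with bufsize 1024,
 * index == BUFSIZE_INDEX(1024) == 1, so "blocks" starts as the number of
 * 1k blocks left in the file ((filesize - pos) >> 10), is raised to at
 * least read_ahead[MAJOR(dev)] >> 1 (read_ahead[] counts 512-byte
 * sectors), and is finally capped at NBUF (16) outstanding requests.
 */
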
/*
 * Note: the caller should wake up the buffer_wait list if needed.
 */
static void put_unused_buffer_head(struct buffer_head * bh)
{
	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
		nr_buffer_heads--;
		kmem_cache_free(bh_cachep, bh);
		return;
	}

	memset(bh,0,sizeof(*bh));
	nr_unused_buffer_heads++;
	bh->b_next_free = unused_list;
	unused_list = bh;
}

/*
 * We can't put completed temporary IO buffer_heads directly onto the
 * unused_list when they become unlocked, since the device driver
 * end_request routines still expect access to the buffer_head's
 * fields after the final unlock.  So, the device driver puts them on
 * the reuse_list instead once IO completes, and we recover these to
 * the unused_list here.
 *
 * Note that we don't do a wakeup here, but return a flag indicating
 * whether we got any buffer heads. A task ready to sleep can check
 * the returned value, and any tasks already sleeping will have been
 * awakened when the buffer heads were added to the reuse list.
 */
static inline int recover_reusable_buffer_heads(void)
{
	struct buffer_head *head = xchg(&reuse_list, NULL);
	int found = 0;

	if (head) {
		do {
			struct buffer_head *bh = head;
			head = head->b_next_free;
			put_unused_buffer_head(bh);
		} while (head);
		found = 1;
	}
	return found;
}

/*
 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
 * no-buffer-head deadlock.  Return NULL on failure; waiting for
 * buffer heads is now handled in create_buffers().
 */
static struct buffer_head * get_unused_buffer_head(int async)
{
	struct buffer_head * bh;

	recover_reusable_buffer_heads();
	if (nr_unused_buffer_heads > NR_RESERVED) {
		bh = unused_list;
		unused_list = bh->b_next_free;
		nr_unused_buffer_heads--;
		return bh;
	}

	/* This is critical.  We can't swap out pages to get
	 * more buffer heads, because the swap-out may need
	 * more buffer-heads itself.  Thus SLAB_BUFFER.
	 */
	if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		nr_buffer_heads++;
		return bh;
	}

	/*
	 * If we need an async buffer, use the reserved buffer heads.
	 */
	if (async && unused_list) {
		bh = unused_list;
		unused_list = bh->b_next_free;
		nr_unused_buffer_heads--;
		return bh;
	}

#if 0
	/*
	 * (Pending further analysis ...)
	 * Ordinary (non-async) requests can use a different memory priority
	 * to free up pages. Any swapping thus generated will use async
	 * buffer heads.
	 */
	if(!async &&
	   (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		nr_buffer_heads++;
		return bh;
	}
#endif

	return NULL;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 * The async flag is used to differentiate async IO (paging, swapping)
 * from ordinary buffer allocations, and only async requests are allowed
 * to sleep waiting for buffer heads.
 */
static struct buffer_head * create_buffers(unsigned long page,
					   unsigned long size, int async)
{
	struct wait_queue wait = { current, NULL };
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = get_unused_buffer_head(async);
		if (!bh)
			goto no_grow;

		bh->b_dev = B_FREE;  /* Flag as unused */
		bh->b_this_page = head;
		head = bh;

		bh->b_state = 0;
		bh->b_next_free = NULL;
		bh->b_count = 0;
		bh->b_size = size;

		bh->b_data = (char *) (page+offset);
		bh->b_list = 0;
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		do {
			bh = head;
			head = head->b_this_page;
			put_unused_buffer_head(bh);
		} while (head);

		/* Wake up any waiters ... */
		wake_up(&buffer_wait);
	}

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */
	if (!async)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	run_task_queue(&tq_disk);

	/*
	 * Set our state for sleeping, then check again for buffer heads.
	 * This ensures we won't miss a wake_up from an interrupt.
	 */
	add_wait_queue(&buffer_wait, &wait);
	current->state = TASK_UNINTERRUPTIBLE;
	if (!recover_reusable_buffer_heads())
		schedule();
	remove_wait_queue(&buffer_wait, &wait);
	current->state = TASK_RUNNING;
	goto try_again;
}

/* Run the hooks that have to be done when a page I/O has completed. */
static inline void after_unlock_page (struct page * page)
{
	if (test_and_clear_bit(PG_decr_after, &page->flags)) {
		atomic_dec(&nr_async_pages);
#ifdef DEBUG_SWAP
		printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
			(char *) page_address(page),
			atomic_read(&nr_async_pages));
#endif
	}
	if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
		swap_after_unlock_page(page->offset);
	if (test_and_clear_bit(PG_free_after, &page->flags))
		__free_page(page);
}

/*
 * Free all temporary buffers belonging to a page.
 * This needs to be called with interrupts disabled.
 */
static inline void free_async_buffers (struct buffer_head * bh)
{
	struct buffer_head *tmp, *tail;

	/*
	 * Link all the buffers into the b_next_free list,
	 * so we only have to do one xchg() operation ...
	 */
	tail = bh;
	while ((tmp = tail->b_this_page) != bh) {
		tail->b_next_free = tmp;
		tail = tmp;
	}

	/* Update the reuse list */
	tail->b_next_free = xchg(&reuse_list, NULL);
	reuse_list = bh;

	/* Wake up any waiters ... */
	wake_up(&buffer_wait);
}

static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *tmp;
	struct page *page;

	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);

	/* This is a temporary buffer used for page I/O. */
	page = mem_map + MAP_NR(bh->b_data);
	if (!PageLocked(page))
		goto not_locked;
	if (bh->b_count != 1)
		goto bad_count;

	if (!test_bit(BH_Uptodate, &bh->b_state))
		set_bit(PG_error, &page->flags);

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 *
	 * Async buffer_heads are here only as labels for IO, and get
	 * thrown away once the IO for this page is complete.  IO is
	 * deemed complete once all buffers have been visited
	 * (b_count==0) and are now unlocked. We must make sure that
	 * only the _last_ buffer that decrements its count is the one
	 * that free's the page..
	 */
	save_flags(flags);
	cli();
	bh->b_count--;
	tmp = bh;
	do {
		if (tmp->b_count)
			goto still_busy;
		tmp = tmp->b_this_page;
	} while (tmp != bh);

	/* OK, the async IO on this page is complete. */
	free_async_buffers(bh);
	restore_flags(flags);
	clear_bit(PG_locked, &page->flags);
	wake_up(&page->wait);
	after_unlock_page(page);
	return;

still_busy:
	restore_flags(flags);
	return;

not_locked:
	printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
	return;

bad_count:
	printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
	return;
}

/*
 * Start I/O on a page.
 * This function expects the page to be locked and may return before I/O is complete.
 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
 */
int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
{
	struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
	int block, nr;

	if (!PageLocked(page))
		panic("brw_page: page not locked for I/O");
	clear_bit(PG_uptodate, &page->flags);
	clear_bit(PG_error, &page->flags);
	/*
	 * Allocate async buffer heads pointing to this page, just for I/O.
	 * They do _not_ show up in the buffer hash table!
	 * They are _not_ registered in page->buffers either!
	 */
	bh = create_buffers(page_address(page), size, 1);
	if (!bh) {
		/* WSH: exit here leaves page->count incremented */
		clear_bit(PG_locked, &page->flags);
		wake_up(&page->wait);
		return -ENOMEM;
	}
	nr = 0;
	next = bh;
	do {
		struct buffer_head * tmp;
		block = *(b++);

		init_buffer(next, dev, block, end_buffer_io_async, NULL);
		set_bit(BH_Uptodate, &next->b_state);

		/*
		 * When we use bmap, we define block zero to represent
		 * a hole.  ll_rw_page, however, may legitimately
		 * access block zero, and we need to distinguish the
		 * two cases.
		 */
		if (bmap && !block) {
			memset(next->b_data, 0, size);
			next->b_count--;
			continue;
		}
		tmp = get_hash_table(dev, block, size);
		if (tmp) {
			if (!buffer_uptodate(tmp)) {
				if (rw == READ)
					ll_rw_block(READ, 1, &tmp);
				wait_on_buffer(tmp);
			}
			if (rw == READ)
				memcpy(next->b_data, tmp->b_data, size);
			else {
				memcpy(tmp->b_data, next->b_data, size);
				mark_buffer_dirty(tmp, 0);
			}
			brelse(tmp);
			next->b_count--;
			continue;
		}
		if (rw == READ)
			clear_bit(BH_Uptodate, &next->b_state);
		else
			set_bit(BH_Dirty, &next->b_state);
		arr[nr++] = next;
	} while (prev = next, (next = next->b_this_page) != NULL);
	prev->b_this_page = bh;

	if (nr) {
		ll_rw_block(rw, nr, arr);
		/* The rest of the work is done in mark_buffer_uptodate()
		 * and unlock_buffer(). */
	} else {
		unsigned long flags;
		clear_bit(PG_locked, &page->flags);
		set_bit(PG_uptodate, &page->flags);
		wake_up(&page->wait);
		save_flags(flags);
		cli();
		free_async_buffers(bh);
		restore_flags(flags);
		after_unlock_page(page);
	}
	++current->maj_flt;
	return 0;
}

/*
 * This is called by end_request() when I/O has completed.
 */
void mark_buffer_uptodate(struct buffer_head * bh, int on)
{
	if (on) {
		struct buffer_head *tmp = bh;
		set_bit(BH_Uptodate, &bh->b_state);
		/* If a page has buffers and all these buffers are uptodate,
		 * then the page is uptodate. */
		do {
			if (!test_bit(BH_Uptodate, &tmp->b_state))
				return;
			tmp=tmp->b_this_page;
		} while (tmp && tmp != bh);
		set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
		return;
	}
	clear_bit(BH_Uptodate, &bh->b_state);
}

/*
 * Generic "readpage" function for block devices that have the normal
 * bmap functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * mark_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
int generic_readpage(struct file * file, struct page * page)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long block;
	int *p, nr[PAGE_SIZE/512];
	int i;

	atomic_inc(&page->count);
	set_bit(PG_locked, &page->flags);
	set_bit(PG_free_after, &page->flags);

	i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
	block = page->offset >> inode->i_sb->s_blocksize_bits;
	p = nr;
	do {
		*p = inode->i_op->bmap(inode, block);
		i--;
		block++;
		p++;
	} while (i > 0);

	/* IO start */
	brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
	return 0;
}

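/*
 * Call-path sketch (not additional code): a block filesystem whose bmap()
 * works in the usual way can point the readpage member of its
 * inode_operations at generic_readpage (ext2 does this for regular
 * files); page-in through the page cache then goes
 * bmap() -> brw_page() -> end_buffer_io_async() as above.
 */
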
/*
 * Try to increase the number of buffers available: the size argument
 * is used to determine what kind of buffers we want.
 */
static int grow_buffers(int size)
{
	unsigned long page;
	struct buffer_head *bh, *tmp;
	struct buffer_head * insert_point;
	int isize;

	if ((size & 511) || (size > PAGE_SIZE)) {
		printk("VFS: grow_buffers: size = %d\n",size);
		return 0;
	}

	if (!(page = __get_free_page(GFP_BUFFER)))
		return 0;
	bh = create_buffers(page, size, 0);
	if (!bh) {
		free_page(page);
		return 0;
	}

	isize = BUFSIZE_INDEX(size);
	insert_point = free_list[isize];

	tmp = bh;
	while (1) {
		if (insert_point) {
			tmp->b_next_free = insert_point->b_next_free;
			tmp->b_prev_free = insert_point;
			insert_point->b_next_free->b_prev_free = tmp;
			insert_point->b_next_free = tmp;
		} else {
			tmp->b_prev_free = tmp;
			tmp->b_next_free = tmp;
		}
		insert_point = tmp;
		++nr_buffers;
		if (tmp->b_this_page)
			tmp = tmp->b_this_page;
		else
			break;
	}
	tmp->b_this_page = bh;
	free_list[isize] = bh;
	mem_map[MAP_NR(page)].buffers = bh;
	buffermem += PAGE_SIZE;
	return 1;
}

/*
 * Can the buffer be thrown out?
 */
#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh)		((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and free's the page if so.
 *
 * Wake up bdflush() if this fails - if we're running low on memory due
 * to dirty buffers, we need to flush them out as quickly as possible.
 */
int try_to_free_buffers(struct page * page_map)
{
	struct buffer_head * tmp, * bh = page_map->buffers;

	tmp = bh;
	do {
		struct buffer_head * p = tmp;

		tmp = tmp->b_this_page;
		if (!buffer_busy(p))
			continue;

		wakeup_bdflush(0);
		return 0;
	} while (tmp != bh);

	tmp = bh;
	do {
		struct buffer_head * p = tmp;
		tmp = tmp->b_this_page;
		nr_buffers--;
		remove_from_queues(p);
		put_unused_buffer_head(p);
	} while (tmp != bh);

	/* Wake up anyone waiting for buffer heads */
	wake_up(&buffer_wait);

	/* And free the page */
	buffermem -= PAGE_SIZE;
	page_map->buffers = NULL;
	__free_page(page_map);
	return 1;
}

/* ================== Debugging =================== */

void show_buffers(void)
{
	struct buffer_head * bh;
	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
	int protected = 0;
	int nlist;
	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};

	printk("Buffer memory:   %6dkB\n",buffermem>>10);
	printk("Buffer heads:    %6d\n",nr_buffer_heads);
	printk("Buffer blocks:   %6d\n",nr_buffers);
	printk("Buffer hashed:   %6d\n",nr_hashed_buffers);

	for(nlist = 0; nlist < NR_LIST; nlist++) {
		found = locked = dirty = used = lastused = protected = 0;
		bh = lru_list[nlist];
		if(!bh) continue;

		do {
			found++;
			if (buffer_locked(bh))
				locked++;
			if (buffer_protected(bh))
				protected++;
			if (buffer_dirty(bh))
				dirty++;
			if (bh->b_count)
				used++, lastused = found;
			bh = bh->b_next_free;
		} while (bh != lru_list[nlist]);
		printk("%8s: %d buffers, %d used (last=%d), "
		       "%d locked, %d protected, %d dirty\n",
		       buf_types[nlist], found, used, lastused,
		       locked, protected, dirty);
	}
}

/* ===================== Init ======================= */

/*
 * allocate the hash table and init the free list
 * Use gfp() for the hash table to decrease TLB misses, use
 * SLAB cache for buffer heads.
 */
void __init buffer_init(unsigned long memory_size)
{
	int order;
	unsigned int nr_hash;

	/* we need to guess at the right sort of size for a buffer cache.
	   the heuristic from working with large databases and getting
	   fsync times (ext2) manageable, is the following */

	memory_size >>= 20;
	for (order = 5; (1UL << order) < memory_size; order++);

	/* try to allocate something until we get it or we're asking
	   for something that is really too small */

	do {
		nr_hash = (1UL << order) * PAGE_SIZE /
		    sizeof(struct buffer_head *);
		hash_table = (struct buffer_head **)
		    __get_free_pages(GFP_ATOMIC, order);
	} while (hash_table == NULL && --order > 4);

	if (!hash_table)
		panic("Failed to allocate buffer hash table\n");
	memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
	bh_hash_mask = nr_hash-1;

	bh_cachep = kmem_cache_create("buffer_head",
				      sizeof(struct buffer_head),
				      0,
				      SLAB_HWCACHE_ALIGN, NULL, NULL);
	if(!bh_cachep)
		panic("Cannot create buffer head SLAB cache\n");
	/*
	 * Allocate the reserved buffer heads.
	 */
	while (nr_buffer_heads < NR_RESERVED) {
		struct buffer_head * bh;

		bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
		if (!bh)
			break;
		put_unused_buffer_head(bh);
		nr_buffer_heads++;
	}

	lru_list[BUF_CLEAN] = 0;
	grow_buffers(BLOCK_SIZE);
}

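/*
 * Sizing example for the heuristic above: on a 64 MB machine,
 * memory_size >> 20 == 64 and the loop settles on order 6, so the hash
 * table is 2^6 pages -- with 4 kB pages and 32-bit pointers that is
 * 256 kB, or 65536 chain heads, making bh_hash_mask 0xffff.  If the
 * allocation fails the order is walked back down, but never below
 * 2^5 pages.
 */
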
/* ====================== bdflush support =================== */

/* This is a simple kernel daemon, whose job it is to provide a dynamic
 * response to dirty buffers.  Once this process is activated, we write back
 * a limited number of buffers to the disks and then go back to sleep again.
 */
static struct wait_queue * bdflush_done = NULL;
struct task_struct *bdflush_tsk = 0;

void wakeup_bdflush(int wait)
{
	if (current == bdflush_tsk)
		return;
	wake_up_process(bdflush_tsk);
	if (wait) {
		run_task_queue(&tq_disk);
		sleep_on(&bdflush_done);
	}
}

/*
 * Here we attempt to write back old buffers.
 * To prevent deadlocks for a loop device:
 * 1) Do non-blocking writes to loop (avoids deadlock with running
 *	out of request blocks).
 * 2) But do a blocking write if the only dirty buffers are loop buffers
 *	(otherwise we go into an infinite busy-loop).
 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
 *	with running out of free buffers for loop's "real" device).
 */
static inline void sync_old_buffers(void)
{
	int i;
	int ndirty = 0;
	int wrta_cmd = WRITEA;
#ifdef DEBUG
	int ncount = 0, nwritten = 0;
#endif
	struct buffer_head * bh, *next;

#ifdef DEBUG
	bh = lru_list[BUF_CLEAN];
	if(bh)
		for(i = nr_buffers_type[BUF_CLEAN]; --i > 0; bh = next) {
			next = bh->b_next_free;

			/* Dirty/locked buffer on clean list?  Refile it */
			if (buffer_locked(bh) || buffer_dirty(bh)) {
				ncount++;
				refile_buffer(bh);
			}
		}
#endif

	bh = lru_list[BUF_LOCKED];
	if(bh)
		for(i = nr_buffers_type[BUF_LOCKED]; --i > 0; bh = next) {
			next = bh->b_next_free;

			/* Unlocked buffer on locked list?  Refile it */
			if (!buffer_locked(bh))
				refile_buffer(bh);
		}

restart:
	bh = lru_list[BUF_DIRTY];
	if(bh)
		for (i = nr_buffers_type[BUF_DIRTY];
		     i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
		     bh = next) {
			/* We may have stalled while waiting for
			   I/O to complete. */
			if(bh->b_list != BUF_DIRTY)
				goto restart;
			next = bh->b_next_free;
			if(!lru_list[BUF_DIRTY]) {
				printk("Dirty list empty %d\n", i);
				break;
			}

			/* Clean buffer on dirty list?  Refile it */
			if (!buffer_dirty(bh)) {
				refile_buffer(bh);
				continue;
			}

			if (buffer_locked(bh))
				continue;
			/* Should we write back buffers that are
			   shared or not?? Currently dirty buffers
			   are not shared, so it does not matter */
			next->b_count++;
			bh->b_count++;
			ndirty++;
			bh->b_flushtime = 0;
			if (MAJOR(bh->b_dev) == LOOP_MAJOR) {
				ll_rw_block(wrta_cmd,1, &bh);
				wrta_cmd = WRITEA;
				if (buffer_dirty(bh))
					--ndirty;
			}
			else
				ll_rw_block(WRITE, 1, &bh);
			bh->b_count--;
			next->b_count--;
		}

	/* If we didn't write anything, but there are still
	 * dirty buffers, then make the next write to a
	 * loop device to be a blocking write.
	 * This lets us block--which we _must_ do! */
	if (ndirty == 0
	    && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
		wrta_cmd = WRITE;
		goto restart;
	}

#ifdef DEBUG
	if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
	printk("wrote %d/%d buffers...", nwritten, ndirty);
#endif
	run_task_queue(&tq_disk);
}

/* This is the interface to bdflush.  As we get more sophisticated, we can
 * pass tuning parameters to this "process", to adjust how it behaves.
 * We would want to verify each parameter, however, to make sure that it
 * is reasonable. */

asmlinkage int sys_bdflush(int func, long data)
{
	int i, error = -EPERM;

	lock_kernel();
	if (!capable(CAP_SYS_ADMIN))
		goto out;

	if (func == 1)
		 /* Func 1 used to call sync_old_buffers; a user space
		    daemon would call it periodically.  This is no
		    longer necessary.  Returning -EPERM here makes the
		    daemon silently exit. */
		goto out;

	/* Basically func 2 reads param 0, func 3 writes param 0,
	   func 4 reads param 1, and so on. */
	if (func >= 2) {
		i = (func-2) >> 1;
		error = -EINVAL;
		if (i < 0 || i >= N_PARAM)
			goto out;
		if((func & 1) == 0) {
			error = put_user(bdf_prm.data[i], (int*)data);
			goto out;
		}
		if (data < bdflush_min[i] || data > bdflush_max[i])
			goto out;
		bdf_prm.data[i] = data;
		error = 0;
		goto out;
	}

	/* Having func 0 used to launch the actual bdflush and then never
	 * return (unless explicitly killed). We return zero here to
	 * remain semi-compatible with present update(8) programs.
	 */
	error = 0;
out:
	unlock_kernel();
	return error;
}

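/*
 * Usage sketch (hypothetical tuning program, not part of the kernel):
 * bdflush(2, (long)&value) reads nfract (parameter 0) into "value", and
 * bdflush(3, 60) sets it to 60, i.e. allows 60% of the buffer cache to
 * be dirty before bdflush is woken; writes are range-checked against
 * bdflush_min[]/bdflush_max[] above.
 */
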
/* This is the actual bdflush daemon itself. It used to be started
 * from the syscall above, but now we launch it ourselves internally
 * with kernel_thread(...) directly after the first thread in
 * init/main.c.  Every so often, or when woken up by another task that
 * needs memory, we call sync_old_buffers to partially clear the dirty list.
 */

int bdflush(void * unused)
{
	long remaining = HZ * bdf_prm.b_un.interval;
	struct task_struct *tsk = current;

	/*
	 *	We have a bare-bones task_struct, and really should fill
	 *	in a few more things so "top" and /proc/2/{exe,root,cwd}
	 *	display semi-sane things. Not real crucial though...
	 */

	tsk->session = 1;
	tsk->pgrp = 1;
	tsk->dumpable = 0; /* inhibit ptrace() */
	strcpy(tsk->comm, "kflushd");
	sigfillset(&tsk->blocked);
	bdflush_tsk = tsk;

	/*
	 *	As a kernel thread we want to tamper with system buffers
	 *	and other internals and thus be subject to the SMP locking
	 *	rules. (On a uniprocessor box this does nothing).
	 */
	lock_kernel();

	for (;;) {
		tsk->state = TASK_INTERRUPTIBLE;
		remaining = schedule_timeout(remaining);

#ifdef DEBUG
		printk("bdflush() activated...");
#endif
		CHECK_EMERGENCY_SYNC

		if (remaining == 0) {
			/*
			 * Also try to flush inodes and supers, since
			 * otherwise there would be no way of ensuring
			 * that these quantities ever get written
			 * back.  Ideally, we would have a timestamp
			 * on the inodes and superblocks so that we
			 * could write back only the old ones.
			 */
			sync_supers(0);
			sync_inodes(0);
			remaining = HZ * bdf_prm.b_un.interval;
		}

		/* Keep flushing till there aren't very many dirty buffers */
		do {
			sync_old_buffers();
		} while(nr_buffers_type[BUF_DIRTY] > nr_buffers * bdf_prm.b_un.nfract/100);

		wake_up(&bdflush_done);
#ifdef DEBUG
		printk("sleeping again.\n");
#endif