Linux 2.1.89-4
[davej-history.git] / fs / buffer.c
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Some bdflush() changes for the dynamic ramdisk - Paul Gortmaker, 12/94 */
14 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
16 /* Removed a lot of unnecessary code and simplified things now that
17 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
20 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
21 * hash table, use SLAB cache for buffer heads. -DaveM
24 #include <linux/sched.h>
25 #include <linux/kernel.h>
26 #include <linux/major.h>
27 #include <linux/string.h>
28 #include <linux/locks.h>
29 #include <linux/errno.h>
30 #include <linux/malloc.h>
31 #include <linux/slab.h>
32 #include <linux/pagemap.h>
33 #include <linux/swap.h>
34 #include <linux/swapctl.h>
35 #include <linux/smp.h>
36 #include <linux/smp_lock.h>
37 #include <linux/vmalloc.h>
38 #include <linux/blkdev.h>
39 #include <linux/sysrq.h>
40 #include <linux/file.h>
42 #include <asm/system.h>
43 #include <asm/uaccess.h>
44 #include <asm/io.h>
45 #include <asm/bitops.h>
47 #define NR_SIZES 5
48 static char buffersize_index[17] =
49 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4};
51 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
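/*
 * BUFSIZE_INDEX maps a buffer size onto its slot in buffersize_index[]
 * (and hence free_list[]): 512 -> 0, 1024 -> 1, 2048 -> 2, 4096 -> 3,
 * 8192 -> 4.  A -1 entry marks a size we do not support.
 */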
52 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
53 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
54 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
55 number of unused buffer heads */
58 * How large a hash table do we need?
60 #define HASH_PAGES_ORDER 4
61 #define HASH_PAGES (1UL << HASH_PAGES_ORDER)
62 #define NR_HASH (HASH_PAGES*PAGE_SIZE/sizeof(struct buffer_head *))
63 #define HASH_MASK (NR_HASH-1)
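/*
 * Rough sizing: with HASH_PAGES_ORDER 4 the table occupies 16 pages, so
 * on a machine with 4kB pages and 4-byte pointers NR_HASH works out to
 * 16 * 4096 / 4 = 16384 hash chains.  NR_HASH is always a power of two,
 * which is what makes the HASH_MASK trick valid.
 */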
65 static int grow_buffers(int pri, int size);
67 static struct buffer_head ** hash_table;
68 static struct buffer_head * lru_list[NR_LIST] = {NULL, };
69 static struct buffer_head * free_list[NR_SIZES] = {NULL, };
71 static kmem_cache_t *bh_cachep;
73 static struct buffer_head * unused_list = NULL;
74 static struct buffer_head * reuse_list = NULL;
75 static struct wait_queue * buffer_wait = NULL;
77 static int nr_buffers = 0;
78 static int nr_buffers_type[NR_LIST] = {0,};
79 static int nr_buffer_heads = 0;
80 static int nr_unused_buffer_heads = 0;
81 static int refilled = 0; /* Set NZ when a buffer freelist is refilled
82 this is used by the loop device */
84 /* This is used by some architectures to estimate available memory. */
85 int buffermem = 0;
87 /* Here is the parameter block for the bdflush process. If you add or
88 * remove any of the parameters, make sure to update kernel/sysctl.c.
91 #define N_PARAM 9
93 /* The dummy values in this structure are left in there for compatibility
94 * with old programs that play with the /proc entries.
96 union bdflush_param{
97 struct {
98 int nfract; /* Percentage of buffer cache dirty to
99 activate bdflush */
100 int ndirty; /* Maximum number of dirty blocks to write out per
101 wake-cycle */
102 int nrefill; /* Number of clean buffers to try to obtain
103 each time we call refill */
104 int nref_dirt; /* Dirty buffer threshold for activating bdflush
105 when trying to refill buffers. */
106 int dummy1; /* unused */
107 int age_buffer; /* Time for normal buffer to age before
108 we flush it */
109 int age_super; /* Time for superblock to age before we
110 flush it */
111 int dummy2; /* unused */
112 int dummy3; /* unused */
113 } b_un;
114 unsigned int data[N_PARAM];
115 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
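/*
 * With the defaults above: bdflush kicks in when 40% of the buffer cache
 * is dirty (nfract), writes at most 500 buffers per wake-up (ndirty),
 * tries to obtain 64 clean buffers per refill (nrefill), and ages normal
 * buffers 30 seconds and superblocks 5 seconds (age_buffer/age_super)
 * before flushing them.  The dummy fields are kept only for /proc
 * compatibility.
 */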
117 /* These are the min and max parameter values that we will allow to be assigned */
118 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 100, 100, 1, 1};
119 int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 60000, 60000, 2047, 5};
121 void wakeup_bdflush(int);
124 * Rewrote the wait-routines to use the "new" wait-queue functionality,
125 * and got rid of the cli-sti pairs. The wait-queue routines still
126 * need cli-sti, but now it's just a couple of 386 instructions or so.
128 * Note that the real wait_on_buffer() is an inline function that checks
129 * if 'b_wait' is set before calling this, so that the queues aren't set
130 * up unnecessarily.
132 void __wait_on_buffer(struct buffer_head * bh)
134 struct task_struct *tsk = current;
135 struct wait_queue wait;
137 bh->b_count++;
138 wait.task = tsk;
139 add_wait_queue(&bh->b_wait, &wait);
140 repeat:
141 tsk->state = TASK_UNINTERRUPTIBLE;
142 run_task_queue(&tq_disk);
143 if (buffer_locked(bh)) {
144 schedule();
145 goto repeat;
147 tsk->state = TASK_RUNNING;
148 remove_wait_queue(&bh->b_wait, &wait);
149 bh->b_count--;
152 /* Call sync_buffers with wait!=0 to ensure that the call does not
153 * return until all buffer writes have completed. Sync() may return
154 * before the writes have finished; fsync() may not.
157 /* Godamity-damn. Some buffers (bitmaps for filesystems)
158 * spontaneously dirty themselves without ever brelse being called.
159 * We will ultimately want to put these in a separate list, but for
160 * now we search all of the lists for dirty buffers.
162 static int sync_buffers(kdev_t dev, int wait)
164 int i, retry, pass = 0, err = 0;
165 struct buffer_head * bh, *next;
167 /* One pass for no-wait, three for wait:
168 * 0) write out all dirty, unlocked buffers;
169 * 1) write out all dirty buffers, waiting if locked;
170 * 2) wait for completion by waiting for all buffers to unlock.
172 do {
173 retry = 0;
174 repeat:
175 /* We search all lists as a failsafe mechanism, not because we expect
176 * there to be dirty buffers on any of the other lists.
178 bh = lru_list[BUF_DIRTY];
179 if (!bh)
180 goto repeat2;
181 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
182 if (bh->b_list != BUF_DIRTY)
183 goto repeat;
184 next = bh->b_next_free;
185 if (!lru_list[BUF_DIRTY])
186 break;
187 if (dev && bh->b_dev != dev)
188 continue;
189 if (buffer_locked(bh)) {
190 /* Buffer is locked; skip it unless wait is
191 * requested AND pass > 0.
193 if (!wait || !pass) {
194 retry = 1;
195 continue;
197 wait_on_buffer (bh);
198 goto repeat;
201 /* If an unlocked buffer is not uptodate, there has
202 * been an IO error. Skip it.
204 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
205 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
206 err = -EIO;
207 continue;
210 /* Don't write clean buffers. Don't write ANY buffers
211 * on the third pass.
213 if (!buffer_dirty(bh) || pass >= 2)
214 continue;
216 /* Don't bother about locked buffers.
218 * XXX We checked if it was locked above and there is no
219 * XXX way we could have slept in between. -DaveM
221 if (buffer_locked(bh))
222 continue;
223 bh->b_count++;
224 next->b_count++;
225 bh->b_flushtime = 0;
226 ll_rw_block(WRITE, 1, &bh);
227 bh->b_count--;
228 next->b_count--;
229 retry = 1;
232 repeat2:
233 bh = lru_list[BUF_LOCKED];
234 if (!bh)
235 break;
236 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
237 if (bh->b_list != BUF_LOCKED)
238 goto repeat2;
239 next = bh->b_next_free;
240 if (!lru_list[BUF_LOCKED])
241 break;
242 if (dev && bh->b_dev != dev)
243 continue;
244 if (buffer_locked(bh)) {
245 /* Buffer is locked; skip it unless wait is
246 * requested AND pass > 0.
248 if (!wait || !pass) {
249 retry = 1;
250 continue;
252 wait_on_buffer (bh);
253 goto repeat2;
257 /* If we are waiting for the sync to succeed, and if any dirty
258 * blocks were written, then repeat; on the second pass, only
259 * wait for buffers being written (do not pass to write any
260 * more buffers on the second pass).
262 } while (wait && retry && ++pass<=2);
263 return err;
266 void sync_dev(kdev_t dev)
268 sync_buffers(dev, 0);
269 sync_supers(dev);
270 sync_inodes(dev);
271 sync_buffers(dev, 0);
272 sync_dquots(dev, -1);
274 * FIXME(eric) we need to sync the physical devices here.
275 * This is because some (scsi) controllers have huge amounts of
276 * cache onboard (hundreds of Mb), and we need to instruct
277 * them to commit all of the dirty memory to disk, and we should
278 * not return until this has happened.
280 * This would need to get implemented by going through the assorted
281 * layers so that each block major number can be synced, and this
282 * would call down into the upper and mid-layer scsi.
286 int fsync_dev(kdev_t dev)
288 sync_buffers(dev, 0);
289 sync_supers(dev);
290 sync_inodes(dev);
291 sync_dquots(dev, -1);
292 return sync_buffers(dev, 1);
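/*
 * Note the difference from sync_dev() above: fsync_dev() finishes with
 * sync_buffers(dev, 1), so it does not return until the writes have
 * completed, whereas sync_dev() only starts them (wait == 0).
 */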
295 asmlinkage int sys_sync(void)
297 lock_kernel();
298 fsync_dev(0);
299 unlock_kernel();
300 return 0;
304 * filp may be NULL if called via the msync of a vma.
307 int file_fsync(struct file *filp, struct dentry *dentry)
309 struct inode * inode = dentry->d_inode;
310 struct super_block * sb;
311 kdev_t dev;
313 /* sync the inode to buffers */
314 write_inode_now(inode);
316 /* sync the superblock to buffers */
317 sb = inode->i_sb;
318 wait_on_super(sb);
319 if (sb->s_op && sb->s_op->write_super)
320 sb->s_op->write_super(sb);
322 /* .. finally sync the buffers to disk */
323 dev = inode->i_dev;
324 return sync_buffers(dev, 1);
327 asmlinkage int sys_fsync(unsigned int fd)
329 struct file * file;
330 struct dentry * dentry;
331 struct inode * inode;
332 int err;
334 lock_kernel();
335 err = -EBADF;
336 file = fget(fd);
337 if (!file)
338 goto out;
340 dentry = file->f_dentry;
341 if (!dentry)
342 goto out_putf;
344 inode = dentry->d_inode;
345 if (!inode)
346 goto out_putf;
348 err = -EINVAL;
349 if (!file->f_op || !file->f_op->fsync)
350 goto out_putf;
352 /* We need to protect against concurrent writers.. */
353 down(&inode->i_sem);
354 err = file->f_op->fsync(file, dentry);
355 up(&inode->i_sem);
357 out_putf:
358 fput(file);
359 out:
360 unlock_kernel();
361 return err;
364 asmlinkage int sys_fdatasync(unsigned int fd)
366 struct file * file;
367 struct dentry * dentry;
368 struct inode * inode;
369 int err;
371 lock_kernel();
372 err = -EBADF;
373 file = fget(fd);
374 if (!file)
375 goto out;
377 dentry = file->f_dentry;
378 if (!dentry)
379 goto out_putf;
381 inode = dentry->d_inode;
382 if (!inode)
383 goto out_putf;
385 err = -EINVAL;
386 if (!file->f_op || !file->f_op->fsync)
387 goto out_putf;
389 /* this needs further work; at the moment it is identical to fsync() */
390 err = file->f_op->fsync(file, dentry);
392 out_putf:
393 fput(file);
394 out:
395 unlock_kernel();
396 return err;
399 void invalidate_buffers(kdev_t dev)
401 int i;
402 int nlist;
403 struct buffer_head * bh;
405 for(nlist = 0; nlist < NR_LIST; nlist++) {
406 bh = lru_list[nlist];
407 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
408 if (bh->b_dev != dev)
409 continue;
410 wait_on_buffer(bh);
411 if (bh->b_dev != dev)
412 continue;
413 if (bh->b_count)
414 continue;
415 bh->b_flushtime = 0;
416 clear_bit(BH_Protected, &bh->b_state);
417 clear_bit(BH_Uptodate, &bh->b_state);
418 clear_bit(BH_Dirty, &bh->b_state);
419 clear_bit(BH_Req, &bh->b_state);
424 #define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block))&HASH_MASK)
425 #define hash(dev,block) hash_table[_hashfn(dev,block)]
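/*
 * The hash chain for a block is picked by XOR-ing the (hashed) device
 * number with the block number and masking down to the table size, i.e.
 * a buffer lives on hash_table[(HASHDEV(dev) ^ block) & HASH_MASK].
 */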
427 static inline void remove_from_hash_queue(struct buffer_head * bh)
429 if (bh->b_pprev) {
430 if(bh->b_next)
431 bh->b_next->b_pprev = bh->b_pprev;
432 *bh->b_pprev = bh->b_next;
433 bh->b_pprev = NULL;
437 static inline void remove_from_lru_list(struct buffer_head * bh)
439 if (!(bh->b_prev_free) || !(bh->b_next_free))
440 panic("VFS: LRU block list corrupted");
441 if (bh->b_dev == B_FREE)
442 panic("LRU list corrupted");
443 bh->b_prev_free->b_next_free = bh->b_next_free;
444 bh->b_next_free->b_prev_free = bh->b_prev_free;
446 if (lru_list[bh->b_list] == bh)
447 lru_list[bh->b_list] = bh->b_next_free;
448 if (lru_list[bh->b_list] == bh)
449 lru_list[bh->b_list] = NULL;
450 bh->b_next_free = bh->b_prev_free = NULL;
453 static inline void remove_from_free_list(struct buffer_head * bh)
455 int isize = BUFSIZE_INDEX(bh->b_size);
456 if (!(bh->b_prev_free) || !(bh->b_next_free))
457 panic("VFS: Free block list corrupted");
458 if(bh->b_dev != B_FREE)
459 panic("Free list corrupted");
460 if(!free_list[isize])
461 panic("Free list empty");
462 if(bh->b_next_free == bh)
463 free_list[isize] = NULL;
464 else {
465 bh->b_prev_free->b_next_free = bh->b_next_free;
466 bh->b_next_free->b_prev_free = bh->b_prev_free;
467 if (free_list[isize] == bh)
468 free_list[isize] = bh->b_next_free;
470 bh->b_next_free = bh->b_prev_free = NULL;
473 static inline void remove_from_queues(struct buffer_head * bh)
475 if(bh->b_dev == B_FREE) {
476 remove_from_free_list(bh); /* Free list entries should not be
477 in the hash queue */
478 return;
480 nr_buffers_type[bh->b_list]--;
481 remove_from_hash_queue(bh);
482 remove_from_lru_list(bh);
485 static inline void put_last_lru(struct buffer_head * bh)
487 if (bh) {
488 struct buffer_head **bhp = &lru_list[bh->b_list];
490 if (bh == *bhp) {
491 *bhp = bh->b_next_free;
492 return;
495 if(bh->b_dev == B_FREE)
496 panic("Wrong block for lru list");
498 /* Add to back of free list. */
499 remove_from_lru_list(bh);
500 if(!*bhp) {
501 *bhp = bh;
502 (*bhp)->b_prev_free = bh;
505 bh->b_next_free = *bhp;
506 bh->b_prev_free = (*bhp)->b_prev_free;
507 (*bhp)->b_prev_free->b_next_free = bh;
508 (*bhp)->b_prev_free = bh;
512 static inline void put_last_free(struct buffer_head * bh)
514 if (bh) {
515 struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
517 bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */
519 /* Add to back of free list. */
520 if(!*bhp) {
521 *bhp = bh;
522 bh->b_prev_free = bh;
525 bh->b_next_free = *bhp;
526 bh->b_prev_free = (*bhp)->b_prev_free;
527 (*bhp)->b_prev_free->b_next_free = bh;
528 (*bhp)->b_prev_free = bh;
532 static inline void insert_into_queues(struct buffer_head * bh)
534 /* put at end of free list */
535 if(bh->b_dev == B_FREE) {
536 put_last_free(bh);
537 } else {
538 struct buffer_head **bhp = &lru_list[bh->b_list];
540 if(!*bhp) {
541 *bhp = bh;
542 bh->b_prev_free = bh;
545 if (bh->b_next_free)
546 panic("VFS: buffer LRU pointers corrupted");
548 bh->b_next_free = *bhp;
549 bh->b_prev_free = (*bhp)->b_prev_free;
550 (*bhp)->b_prev_free->b_next_free = bh;
551 (*bhp)->b_prev_free = bh;
553 nr_buffers_type[bh->b_list]++;
555 /* Put the buffer in new hash-queue if it has a device. */
556 if (bh->b_dev) {
557 struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
558 if((bh->b_next = *bhp) != NULL)
559 (*bhp)->b_pprev = &bh->b_next;
560 *bhp = bh;
561 bh->b_pprev = bhp; /* Exists in bh hashes. */
562 } else
563 bh->b_pprev = NULL; /* Not in bh hashes. */
567 struct buffer_head * find_buffer(kdev_t dev, int block, int size)
569 struct buffer_head * next;
571 next = hash(dev,block);
572 for (;;) {
573 struct buffer_head *tmp = next;
574 if (!next)
575 break;
576 next = tmp->b_next;
577 if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
578 continue;
579 next = tmp;
580 break;
582 return next;
586 * Why like this, I hear you say... The reason is race-conditions.
587 * As we don't lock buffers (unless we are reading them, that is),
588 * something might happen to the buffer while we sleep (ie a read-error
589 * will force it bad). This shouldn't really happen currently, but
590 * the code is ready.
592 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
594 struct buffer_head * bh;
595 for (;;) {
596 bh = find_buffer(dev,block,size);
597 if (!bh)
598 break;
599 bh->b_count++;
600 bh->b_lru_time = jiffies;
601 if (!buffer_locked(bh))
602 break;
603 __wait_on_buffer(bh);
604 if (bh->b_dev == dev &&
605 bh->b_blocknr == block &&
606 bh->b_size == size)
607 break;
608 bh->b_count--;
610 return bh;
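/*
 * Note that a non-NULL return from get_hash_table() has had b_count
 * incremented, so the caller is expected to drop the reference again
 * (normally via brelse()).
 */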
613 unsigned int get_hardblocksize(kdev_t dev)
616 * Get the hard sector size for the given device. If we don't know
617 * what it is, return 0.
619 if (hardsect_size[MAJOR(dev)] != NULL) {
620 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
621 if (blksize != 0)
622 return blksize;
626 * We don't know what the hardware sector size for this device is.
627 * Return 0 indicating that we don't know.
629 return 0;
632 void set_blocksize(kdev_t dev, int size)
634 extern int *blksize_size[];
635 int i, nlist;
636 struct buffer_head * bh, *bhnext;
638 if (!blksize_size[MAJOR(dev)])
639 return;
641 if (size > PAGE_SIZE)
642 size = 0;
644 switch (size) {
645 default: panic("Invalid blocksize passed to set_blocksize");
646 case 512: case 1024: case 2048: case 4096: case 8192: ;
649 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
650 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
651 return;
653 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
654 return;
655 sync_buffers(dev, 2);
656 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
658 /* We need to be quite careful how we do this - we are moving entries
659 * around on the free list, and we can get in a loop if we are not careful.
661 for(nlist = 0; nlist < NR_LIST; nlist++) {
662 bh = lru_list[nlist];
663 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
664 if(!bh)
665 break;
667 bhnext = bh->b_next_free;
668 if (bh->b_dev != dev)
669 continue;
670 if (bh->b_size == size)
671 continue;
672 bhnext->b_count++;
673 wait_on_buffer(bh);
674 bhnext->b_count--;
675 if (bh->b_dev == dev && bh->b_size != size) {
676 clear_bit(BH_Dirty, &bh->b_state);
677 clear_bit(BH_Uptodate, &bh->b_state);
678 clear_bit(BH_Req, &bh->b_state);
679 bh->b_flushtime = 0;
681 remove_from_hash_queue(bh);
687 * Find a candidate buffer to be reclaimed.
688 * N.B. Must search the entire BUF_LOCKED list rather than terminating
689 * when the first locked buffer is found. Buffers are unlocked at
690 * completion of IO, and under some conditions there may be (many)
691 * unlocked buffers after the first locked one.
693 static struct buffer_head *find_candidate(struct buffer_head *bh,
694 int *list_len, int size)
696 if (!bh)
697 goto no_candidate;
699 for (; (*list_len) > 0; bh = bh->b_next_free, (*list_len)--) {
700 if (size != bh->b_size) {
701 /* This provides a mechanism for freeing blocks
702 * of other sizes, this is necessary now that we
703 * no longer have the lav code.
705 try_to_free_buffer(bh,&bh,1);
706 if (!bh)
707 break;
708 continue;
710 else if (!bh->b_count &&
711 !buffer_locked(bh) &&
712 !buffer_protected(bh) &&
713 !buffer_dirty(bh))
714 return bh;
717 no_candidate:
718 return NULL;
721 static void refill_freelist(int size)
723 struct buffer_head * bh, * next;
724 struct buffer_head * candidate[BUF_DIRTY];
725 int buffers[BUF_DIRTY];
726 int i;
727 int needed, obtained=0;
729 refilled = 1;
731 /* We are going to try to locate this much memory. */
732 needed = bdf_prm.b_un.nrefill * size;
734 while ((nr_free_pages > min_free_pages*2) &&
735 grow_buffers(GFP_BUFFER, size)) {
736 obtained += PAGE_SIZE;
737 if (obtained >= needed)
738 return;
742 * Update the needed amount based on the number of potentially
743 * freeable buffers. We don't want to free more than one quarter
744 * of the available buffers.
746 i = (nr_buffers_type[BUF_CLEAN] + nr_buffers_type[BUF_LOCKED]) >> 2;
747 if (i < bdf_prm.b_un.nrefill) {
748 needed = i * size;
749 if (needed < PAGE_SIZE)
750 needed = PAGE_SIZE;
754 * OK, we cannot grow the buffer cache, now try to get some
755 * from the lru list.
757 repeat:
758 if (obtained >= needed)
759 return;
762 * First set the candidate pointers to usable buffers. This
763 * should be quick nearly all of the time. N.B. There must be
764 * no blocking calls after setting up the candidate[] array!
766 for (i = BUF_CLEAN; i<BUF_DIRTY; i++) {
767 buffers[i] = nr_buffers_type[i];
768 candidate[i] = find_candidate(lru_list[i], &buffers[i], size);
772 * Select the older of the available buffers until we reach our goal.
774 for (;;) {
775 i = BUF_CLEAN;
776 if (!candidate[BUF_CLEAN]) {
777 if (!candidate[BUF_LOCKED])
778 break;
779 i = BUF_LOCKED;
781 else if (candidate[BUF_LOCKED] &&
782 (candidate[BUF_LOCKED]->b_lru_time <
783 candidate[BUF_CLEAN ]->b_lru_time))
784 i = BUF_LOCKED;
786 * Free the selected buffer and get the next candidate.
788 bh = candidate[i];
789 next = bh->b_next_free;
791 obtained += bh->b_size;
792 remove_from_queues(bh);
793 put_last_free(bh);
794 if (obtained >= needed)
795 return;
797 if (--buffers[i] && bh != next)
798 candidate[i] = find_candidate(next, &buffers[i], size);
799 else
800 candidate[i] = NULL;
804 * If there are dirty buffers, do a non-blocking wake-up.
805 * This increases the chances of having buffers available
806 * for the next call ...
808 if (nr_buffers_type[BUF_DIRTY])
809 wakeup_bdflush(0);
812 * Allocate buffers to reach half our goal, if possible.
813 * Since the allocation doesn't block, there's no reason
814 * to search the buffer lists again. Then return if there
815 * are _any_ free buffers.
817 while (obtained < (needed >> 1) &&
818 nr_free_pages > min_free_pages + 5 &&
819 grow_buffers(GFP_BUFFER, size))
820 obtained += PAGE_SIZE;
822 if (free_list[BUFSIZE_INDEX(size)])
823 return;
826 * If there are dirty buffers, wait while bdflush writes
827 * them out. The buffers become locked, but we can just
828 * wait for one to unlock ...
830 if (nr_buffers_type[BUF_DIRTY])
831 wakeup_bdflush(1);
834 * In order to prevent a buffer shortage from exhausting
835 * the system's reserved pages, we force tasks to wait
836 * before using reserved pages for buffers. This is easily
837 * accomplished by waiting on an unused locked buffer.
839 if ((bh = lru_list[BUF_LOCKED]) != NULL) {
840 for (i = nr_buffers_type[BUF_LOCKED]; i--; bh = bh->b_next_free)
842 if (bh->b_size != size)
843 continue;
844 if (bh->b_count)
845 continue;
846 if (!buffer_locked(bh))
847 continue;
848 if (buffer_dirty(bh) || buffer_protected(bh))
849 continue;
850 if (MAJOR(bh->b_dev) == LOOP_MAJOR)
851 continue;
853 * We've found an unused, locked, non-dirty buffer of
854 * the correct size. Claim it so no one else can,
855 * then wait for it to unlock.
857 bh->b_count++;
858 wait_on_buffer(bh);
859 bh->b_count--;
861 * Loop back to harvest this (and maybe other) buffers.
863 goto repeat;
868 * Convert a reserved page into buffers ... should happen only rarely.
870 if (nr_free_pages > (min_free_pages >> 1) &&
871 grow_buffers(GFP_ATOMIC, size)) {
872 #ifdef BUFFER_DEBUG
873 printk("refill_freelist: used reserve page\n");
874 #endif
875 return;
879 * System is _very_ low on memory ... sleep and try later.
881 #ifdef BUFFER_DEBUG
882 printk("refill_freelist: task %s waiting for buffers\n", current->comm);
883 #endif
884 schedule();
885 goto repeat;
888 void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
889 bh_end_io_t *handler, void *dev_id)
891 bh->b_count = 1;
892 bh->b_list = BUF_CLEAN;
893 bh->b_flushtime = 0;
894 bh->b_dev = dev;
895 bh->b_blocknr = block;
896 bh->b_end_io = handler;
897 bh->b_dev_id = dev_id;
900 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
902 mark_buffer_uptodate(bh, uptodate);
903 unlock_buffer(bh);
907 * Ok, this is getblk, and it isn't very clear, again to hinder
908 * race-conditions. Most of the code is seldom used, (ie repeating),
909 * so it should be much more efficient than it looks.
911 * The algorithm is changed: hopefully better, and an elusive bug removed.
913 * 14.02.92: changed it to sync dirty buffers a bit: better performance
914 * when the filesystem starts to get full of dirty blocks (I hope).
916 struct buffer_head * getblk(kdev_t dev, int block, int size)
918 struct buffer_head * bh;
919 int isize;
921 repeat:
922 bh = get_hash_table(dev, block, size);
923 if (bh) {
924 if (!buffer_dirty(bh)) {
925 if (buffer_uptodate(bh))
926 put_last_lru(bh);
927 bh->b_flushtime = 0;
929 set_bit(BH_Touched, &bh->b_state);
930 return bh;
933 isize = BUFSIZE_INDEX(size);
934 get_free:
935 bh = free_list[isize];
936 if (!bh)
937 goto refill;
938 remove_from_free_list(bh);
940 /* OK, FINALLY we know that this buffer is the only one of its kind,
941 * and that it's unused (b_count=0), unlocked, and clean.
943 init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
944 bh->b_lru_time = jiffies;
945 bh->b_state=(1<<BH_Touched);
946 insert_into_queues(bh);
947 return bh;
950 * If we block while refilling the free list, somebody may
951 * create the buffer first ... search the hashes again.
953 refill:
954 refill_freelist(size);
955 if (!find_buffer(dev,block,size))
956 goto get_free;
957 goto repeat;
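/*
 * Typical use (see bread() below): getblk() only guarantees a buffer of
 * the right dev/block/size; if it is not uptodate the caller issues
 * ll_rw_block(READ, 1, &bh), waits with wait_on_buffer(bh), and finally
 * drops the reference with brelse().
 */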
960 void set_writetime(struct buffer_head * buf, int flag)
962 int newtime;
964 if (buffer_dirty(buf)) {
965 /* Set the time at which this dirty buffer should be written back. */
966 newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
967 bdf_prm.b_un.age_buffer);
968 if(!buf->b_flushtime || buf->b_flushtime > newtime)
969 buf->b_flushtime = newtime;
970 } else {
971 buf->b_flushtime = 0;
977 * Put a buffer into the appropriate list, without side-effects.
979 static inline void file_buffer(struct buffer_head *bh, int list)
981 remove_from_queues(bh);
982 bh->b_list = list;
983 insert_into_queues(bh);
987 * A buffer may need to be moved from one buffer list to another
988 * (e.g. in case it is not shared any more). Handle this.
990 void refile_buffer(struct buffer_head * buf)
992 int dispose;
994 if(buf->b_dev == B_FREE) {
995 printk("Attempt to refile free buffer\n");
996 return;
998 if (buffer_dirty(buf))
999 dispose = BUF_DIRTY;
1000 else if (buffer_locked(buf))
1001 dispose = BUF_LOCKED;
1002 else
1003 dispose = BUF_CLEAN;
1004 if(dispose != buf->b_list) {
1005 file_buffer(buf, dispose);
1006 if(dispose == BUF_DIRTY) {
1007 int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
1009 /* This buffer is dirty, maybe we need to start flushing.
1010 * If too high a percentage of the buffers are dirty...
1012 if (nr_buffers_type[BUF_DIRTY] > too_many)
1013 wakeup_bdflush(0);
1015 /* If this is a loop device, and
1016 * more than half of the buffers are dirty...
1017 * (Prevents no-free-buffers deadlock with loop device.)
1019 if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
1020 nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
1021 wakeup_bdflush(1);
1027 * Release a buffer head
1029 void __brelse(struct buffer_head * buf)
1031 wait_on_buffer(buf);
1033 /* If dirty, mark the time this buffer should be written back. */
1034 set_writetime(buf, 0);
1035 refile_buffer(buf);
1037 if (buf->b_count) {
1038 buf->b_count--;
1039 return;
1041 printk("VFS: brelse: Trying to free free buffer\n");
1045 * bforget() is like brelse(), except it removes the buffer
1046 * from the hash-queues (so that it won't be re-used if it's
1047 * shared).
1049 void __bforget(struct buffer_head * buf)
1051 wait_on_buffer(buf);
1052 mark_buffer_clean(buf);
1053 clear_bit(BH_Protected, &buf->b_state);
1054 buf->b_count--;
1055 remove_from_hash_queue(buf);
1056 buf->b_dev = NODEV;
1057 refile_buffer(buf);
1061 * bread() reads a specified block and returns the buffer that contains
1062 * it. It returns NULL if the block was unreadable.
1064 struct buffer_head * bread(kdev_t dev, int block, int size)
1066 struct buffer_head * bh;
1068 if (!(bh = getblk(dev, block, size))) {
1069 printk("VFS: bread: impossible error\n");
1070 return NULL;
1072 if (buffer_uptodate(bh))
1073 return bh;
1074 ll_rw_block(READ, 1, &bh);
1075 wait_on_buffer(bh);
1076 if (buffer_uptodate(bh))
1077 return bh;
1078 brelse(bh);
1079 return NULL;
1083 * Ok, breada can be used as bread, but additionally requests read-ahead
1084 * on the blocks that follow as well (up to the device's read_ahead
1085 * setting, and at most NBUF blocks per call).
1088 #define NBUF 16
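/*
 * NBUF bounds one breada() batch: bhlist[0] is the block actually wanted,
 * and at most NBUF-1 further blocks are submitted as read-ahead (READA).
 */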
1090 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1091 unsigned int pos, unsigned int filesize)
1093 struct buffer_head * bhlist[NBUF];
1094 unsigned int blocks;
1095 struct buffer_head * bh;
1096 int index;
1097 int i, j;
1099 if (pos >= filesize)
1100 return NULL;
1102 if (block < 0 || !(bh = getblk(dev,block,bufsize)))
1103 return NULL;
1105 index = BUFSIZE_INDEX(bh->b_size);
1107 if (buffer_uptodate(bh))
1108 return(bh);
1109 else ll_rw_block(READ, 1, &bh);
1111 blocks = (filesize - pos) >> (9+index);
1113 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1114 blocks = read_ahead[MAJOR(dev)] >> index;
1115 if (blocks > NBUF)
1116 blocks = NBUF;
1118 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1121 bhlist[0] = bh;
1122 j = 1;
1123 for(i=1; i<blocks; i++) {
1124 bh = getblk(dev,block+i,bufsize);
1125 if (buffer_uptodate(bh)) {
1126 brelse(bh);
1127 break;
1129 else bhlist[j++] = bh;
1132 /* Request the read for these buffers, and then release them. */
1133 if (j>1)
1134 ll_rw_block(READA, (j-1), bhlist+1);
1135 for(i=1; i<j; i++)
1136 brelse(bhlist[i]);
1138 /* Wait for this buffer, and then continue on. */
1139 bh = bhlist[0];
1140 wait_on_buffer(bh);
1141 if (buffer_uptodate(bh))
1142 return bh;
1143 brelse(bh);
1144 return NULL;
1147 static void put_unused_buffer_head(struct buffer_head * bh)
1149 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1150 nr_buffer_heads--;
1151 kmem_cache_free(bh_cachep, bh);
1152 return;
1155 memset(bh,0,sizeof(*bh));
1156 nr_unused_buffer_heads++;
1157 bh->b_next_free = unused_list;
1158 unused_list = bh;
1159 if (!waitqueue_active(&buffer_wait))
1160 return;
1161 wake_up(&buffer_wait);
1165 * We can't put completed temporary IO buffer_heads directly onto the
1166 * unused_list when they become unlocked, since the device driver
1167 * end_request routines still expect access to the buffer_head's
1168 * fields after the final unlock. So, the device driver puts them on
1169 * the reuse_list instead once IO completes, and we recover these to
1170 * the unused_list here.
1172 static inline void recover_reusable_buffer_heads(void)
1174 struct buffer_head *head;
1176 head = xchg(&reuse_list, NULL);
1178 while (head) {
1179 struct buffer_head *bh = head;
1180 head = head->b_next_free;
1181 put_unused_buffer_head(bh);
1186 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1187 * no-buffer-head deadlock. Return NULL on failure; waiting for
1188 * buffer heads is now handled in create_buffers().
1190 static struct buffer_head * get_unused_buffer_head(int async)
1192 struct buffer_head * bh;
1194 recover_reusable_buffer_heads();
1195 if (nr_unused_buffer_heads > NR_RESERVED) {
1196 bh = unused_list;
1197 unused_list = bh->b_next_free;
1198 nr_unused_buffer_heads--;
1199 return bh;
1202 /* This is critical. We can't swap out pages to get
1203 * more buffer heads, because the swap-out may need
1204 * more buffer-heads itself. Thus SLAB_ATOMIC.
1206 if((bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC)) != NULL) {
1207 memset(bh, 0, sizeof(*bh));
1208 nr_buffer_heads++;
1209 return bh;
1213 * If we need an async buffer, use the reserved buffer heads.
1215 if (async && unused_list) {
1216 bh = unused_list;
1217 unused_list = bh->b_next_free;
1218 nr_unused_buffer_heads--;
1219 return bh;
1222 #if 0
1224 * (Pending further analysis ...)
1225 * Ordinary (non-async) requests can use a different memory priority
1226 * to free up pages. Any swapping thus generated will use async
1227 * buffer heads.
1229 if(!async &&
1230 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1231 memset(bh, 0, sizeof(*bh));
1232 nr_buffer_heads++;
1233 return bh;
1235 #endif
1237 return NULL;
1241 * Create the appropriate buffers when given a page for data area and
1242 * the size of each buffer. Use the bh->b_this_page linked list to
1243 * follow the buffers created. Return NULL if unable to create more
1244 * buffers.
1245 * The async flag is used to differentiate async IO (paging, swapping)
1246 * from ordinary buffer allocations, and only async requests are allowed
1247 * to sleep waiting for buffer heads.
1249 static struct buffer_head * create_buffers(unsigned long page,
1250 unsigned long size, int async)
1252 struct wait_queue wait = { current, NULL };
1253 struct buffer_head *bh, *head;
1254 long offset;
1256 try_again:
1257 head = NULL;
1258 offset = PAGE_SIZE;
1259 while ((offset -= size) >= 0) {
1260 bh = get_unused_buffer_head(async);
1261 if (!bh)
1262 goto no_grow;
1264 bh->b_dev = B_FREE; /* Flag as unused */
1265 bh->b_this_page = head;
1266 head = bh;
1268 bh->b_state = 0;
1269 bh->b_next_free = NULL;
1270 bh->b_count = 0;
1271 bh->b_size = size;
1273 bh->b_data = (char *) (page+offset);
1274 bh->b_list = 0;
1276 return head;
1278 * In case anything failed, we just free everything we got.
1280 no_grow:
1281 bh = head;
1282 while (bh) {
1283 head = bh;
1284 bh = bh->b_this_page;
1285 put_unused_buffer_head(head);
1289 * Return failure for non-async IO requests. Async IO requests
1290 * are not allowed to fail, so we have to wait until buffer heads
1291 * become available. But we don't want tasks sleeping with
1292 * partially complete buffers, so all were released above.
1294 if (!async)
1295 return NULL;
1297 /* Uhhuh. We're _really_ low on memory. Now we just
1298 * wait for old buffer heads to become free due to
1299 * finishing IO. Since this is an async request and
1300 * the reserve list is empty, we're sure there are
1301 * async buffer heads in use.
1303 run_task_queue(&tq_disk);
1306 * Set our state for sleeping, then check again for buffer heads.
1307 * This ensures we won't miss a wake_up from an interrupt.
1309 add_wait_queue(&buffer_wait, &wait);
1310 current->state = TASK_UNINTERRUPTIBLE;
1311 recover_reusable_buffer_heads();
1312 schedule();
1313 remove_wait_queue(&buffer_wait, &wait);
1314 current->state = TASK_RUNNING;
1315 goto try_again;
1318 /* Run the hooks that have to be done when a page I/O has completed. */
1319 static inline void after_unlock_page (struct page * page)
1321 if (test_and_clear_bit(PG_decr_after, &page->flags)) {
1322 atomic_dec(&nr_async_pages);
1323 #ifdef DEBUG_SWAP
1324 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1325 (char *) page_address(page),
1326 atomic_read(&nr_async_pages));
1327 #endif
1329 if (test_and_clear_bit(PG_free_after, &page->flags))
1330 __free_page(page);
1334 * Free all temporary buffers belonging to a page.
1335 * This needs to be called with interrupts disabled.
1337 static inline void free_async_buffers (struct buffer_head * bh)
1339 struct buffer_head * tmp;
1341 tmp = bh;
1342 do {
1343 tmp->b_next_free = xchg(&reuse_list, NULL);
1344 reuse_list = tmp;
1345 tmp = tmp->b_this_page;
1346 } while (tmp != bh);
1349 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
1351 unsigned long flags;
1352 struct buffer_head *tmp;
1353 struct page *page;
1355 mark_buffer_uptodate(bh, uptodate);
1356 unlock_buffer(bh);
1358 /* This is a temporary buffer used for page I/O. */
1359 page = mem_map + MAP_NR(bh->b_data);
1360 if (!PageLocked(page))
1361 goto not_locked;
1362 if (bh->b_count != 1)
1363 goto bad_count;
1365 if (!test_bit(BH_Uptodate, &bh->b_state))
1366 set_bit(PG_error, &page->flags);
1369 * Be _very_ careful from here on. Bad things can happen if
1370 * two buffer heads end IO at almost the same time and both
1371 * decide that the page is now completely done.
1373 * Async buffer_heads are here only as labels for IO, and get
1374 * thrown away once the IO for this page is complete. IO is
1375 * deemed complete once all buffers have been visited
1376 * (b_count==0) and are now unlocked. We must make sure that
1377 * only the _last_ buffer that decrements its count is the one
1378 * that frees the page..
1380 save_flags(flags);
1381 cli();
1382 bh->b_count--;
1383 tmp = bh;
1384 do {
1385 if (tmp->b_count)
1386 goto still_busy;
1387 tmp = tmp->b_this_page;
1388 } while (tmp != bh);
1390 /* OK, the async IO on this page is complete. */
1391 free_async_buffers(bh);
1392 restore_flags(flags);
1393 clear_bit(PG_locked, &page->flags);
1394 wake_up(&page->wait);
1395 after_unlock_page(page);
1396 wake_up(&buffer_wait);
1397 return;
1399 still_busy:
1400 restore_flags(flags);
1401 return;
1403 not_locked:
1404 printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1405 return;
1407 bad_count:
1408 printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1409 return;
1413 * Start I/O on a page.
1414 * This function expects the page to be locked and may return before I/O is complete.
1415 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1417 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1419 struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
1420 int block, nr;
1422 if (!PageLocked(page))
1423 panic("brw_page: page not locked for I/O");
1424 clear_bit(PG_uptodate, &page->flags);
1425 clear_bit(PG_error, &page->flags);
1427 * Allocate async buffer heads pointing to this page, just for I/O.
1428 * They do _not_ show up in the buffer hash table!
1429 * They are _not_ registered in page->buffers either!
1431 bh = create_buffers(page_address(page), size, 1);
1432 if (!bh) {
1433 /* WSH: exit here leaves page->count incremented */
1434 clear_bit(PG_locked, &page->flags);
1435 wake_up(&page->wait);
1436 return -ENOMEM;
1438 nr = 0;
1439 next = bh;
1440 do {
1441 struct buffer_head * tmp;
1442 block = *(b++);
1444 init_buffer(next, dev, block, end_buffer_io_async, NULL);
1445 set_bit(BH_Uptodate, &next->b_state);
1448 * When we use bmap, we define block zero to represent
1449 * a hole. ll_rw_page, however, may legitimately
1450 * access block zero, and we need to distinguish the
1451 * two cases.
1453 if (bmap && !block) {
1454 memset(next->b_data, 0, size);
1455 next->b_count--;
1456 continue;
1458 tmp = get_hash_table(dev, block, size);
1459 if (tmp) {
1460 if (!buffer_uptodate(tmp)) {
1461 if (rw == READ)
1462 ll_rw_block(READ, 1, &tmp);
1463 wait_on_buffer(tmp);
1465 if (rw == READ)
1466 memcpy(next->b_data, tmp->b_data, size);
1467 else {
1468 memcpy(tmp->b_data, next->b_data, size);
1469 mark_buffer_dirty(tmp, 0);
1471 brelse(tmp);
1472 next->b_count--;
1473 continue;
1475 if (rw == READ)
1476 clear_bit(BH_Uptodate, &next->b_state);
1477 else
1478 set_bit(BH_Dirty, &next->b_state);
1479 arr[nr++] = next;
1480 } while (prev = next, (next = next->b_this_page) != NULL);
1481 prev->b_this_page = bh;
1483 if (nr) {
1484 ll_rw_block(rw, nr, arr);
1485 /* The rest of the work is done in mark_buffer_uptodate()
1486 * and unlock_buffer(). */
1487 } else {
1488 unsigned long flags;
1489 clear_bit(PG_locked, &page->flags);
1490 set_bit(PG_uptodate, &page->flags);
1491 wake_up(&page->wait);
1492 save_flags(flags);
1493 cli();
1494 free_async_buffers(bh);
1495 restore_flags(flags);
1496 after_unlock_page(page);
1498 ++current->maj_flt;
1499 return 0;
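/*
 * On return the page is usually still locked: end_buffer_io_async() will
 * unlock it (and run after_unlock_page()) once the last temporary buffer
 * completes.  Only when every block could be satisfied from the cache or
 * was a bmap hole (nr == 0) is the page unlocked before returning.
 */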
1503 * This is called by end_request() when I/O has completed.
1505 void mark_buffer_uptodate(struct buffer_head * bh, int on)
1507 if (on) {
1508 struct buffer_head *tmp = bh;
1509 set_bit(BH_Uptodate, &bh->b_state);
1510 /* If a page has buffers and all these buffers are uptodate,
1511 * then the page is uptodate. */
1512 do {
1513 if (!test_bit(BH_Uptodate, &tmp->b_state))
1514 return;
1515 tmp=tmp->b_this_page;
1516 } while (tmp && tmp != bh);
1517 set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
1518 return;
1520 clear_bit(BH_Uptodate, &bh->b_state);
1524 * Generic "readpage" function for block devices that have the normal
1525 * bmap functionality. This is most of the block device filesystems.
1526 * Reads the page asynchronously --- the unlock_buffer() and
1527 * mark_buffer_uptodate() functions propagate buffer state into the
1528 * page struct once IO has completed.
1530 int generic_readpage(struct file * file, struct page * page)
1532 struct dentry *dentry = file->f_dentry;
1533 struct inode *inode = dentry->d_inode;
1534 unsigned long block;
1535 int *p, nr[PAGE_SIZE/512];
1536 int i;
1538 atomic_inc(&page->count);
1539 set_bit(PG_locked, &page->flags);
1540 set_bit(PG_free_after, &page->flags);
1542 i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1543 block = page->offset >> inode->i_sb->s_blocksize_bits;
1544 p = nr;
1545 do {
1546 *p = inode->i_op->bmap(inode, block);
1547 i--;
1548 block++;
1549 p++;
1550 } while (i > 0);
1552 /* IO start */
1553 brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
1554 return 0;
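/*
 * The page reference taken above is paired with PG_free_after: once the
 * async IO finishes, after_unlock_page() sees the flag and drops the
 * reference with __free_page().
 */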
1558 * Try to increase the number of buffers available: the size argument
1559 * is used to determine what kind of buffers we want.
1561 static int grow_buffers(int pri, int size)
1563 unsigned long page;
1564 struct buffer_head *bh, *tmp;
1565 struct buffer_head * insert_point;
1566 int isize;
1568 if ((size & 511) || (size > PAGE_SIZE)) {
1569 printk("VFS: grow_buffers: size = %d\n",size);
1570 return 0;
1573 if (!(page = __get_free_page(pri)))
1574 return 0;
1575 bh = create_buffers(page, size, 0);
1576 if (!bh) {
1577 free_page(page);
1578 return 0;
1581 isize = BUFSIZE_INDEX(size);
1582 insert_point = free_list[isize];
1584 tmp = bh;
1585 while (1) {
1586 if (insert_point) {
1587 tmp->b_next_free = insert_point->b_next_free;
1588 tmp->b_prev_free = insert_point;
1589 insert_point->b_next_free->b_prev_free = tmp;
1590 insert_point->b_next_free = tmp;
1591 } else {
1592 tmp->b_prev_free = tmp;
1593 tmp->b_next_free = tmp;
1595 insert_point = tmp;
1596 ++nr_buffers;
1597 if (tmp->b_this_page)
1598 tmp = tmp->b_this_page;
1599 else
1600 break;
1602 tmp->b_this_page = bh;
1603 free_list[isize] = bh;
1604 mem_map[MAP_NR(page)].buffers = bh;
1605 buffermem += PAGE_SIZE;
1606 return 1;
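/*
 * Each successful grow_buffers() call therefore adds exactly one page of
 * buffers: PAGE_SIZE/size buffer heads are linked into free_list[isize]
 * and the page is accounted for in buffermem.
 */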
1610 /* =========== Reduce the buffer memory ============= */
1612 static inline int buffer_waiting(struct buffer_head * bh)
1614 return waitqueue_active(&bh->b_wait);
1618 * try_to_free_buffer() checks if all the buffers on this particular page
1619 * are unused, and frees the page if so.
1621 int try_to_free_buffer(struct buffer_head * bh, struct buffer_head ** bhp,
1622 int priority)
1624 unsigned long page;
1625 struct buffer_head * tmp, * p;
1627 *bhp = bh;
1628 page = (unsigned long) bh->b_data;
1629 page &= PAGE_MASK;
1630 tmp = bh;
1631 do {
1632 if (!tmp)
1633 return 0;
1634 if (tmp->b_count || buffer_protected(tmp) ||
1635 buffer_dirty(tmp) || buffer_locked(tmp) ||
1636 buffer_waiting(tmp))
1637 return 0;
1638 if (priority && buffer_touched(tmp))
1639 return 0;
1640 tmp = tmp->b_this_page;
1641 } while (tmp != bh);
1642 tmp = bh;
1643 do {
1644 p = tmp;
1645 tmp = tmp->b_this_page;
1646 nr_buffers--;
1647 if (p == *bhp) {
1648 *bhp = p->b_prev_free;
1649 if (p == *bhp) /* Was this the last in the list? */
1650 *bhp = NULL;
1652 remove_from_queues(p);
1653 put_unused_buffer_head(p);
1654 } while (tmp != bh);
1655 buffermem -= PAGE_SIZE;
1656 mem_map[MAP_NR(page)].buffers = NULL;
1657 free_page(page);
1658 return 1;
1661 /* ================== Debugging =================== */
1663 void show_buffers(void)
1665 struct buffer_head * bh;
1666 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
1667 int protected = 0;
1668 int nlist;
1669 static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
1671 printk("Buffer memory: %6dkB\n",buffermem>>10);
1672 printk("Buffer heads: %6d\n",nr_buffer_heads);
1673 printk("Buffer blocks: %6d\n",nr_buffers);
1675 for(nlist = 0; nlist < NR_LIST; nlist++) {
1676 found = locked = dirty = used = lastused = protected = 0;
1677 bh = lru_list[nlist];
1678 if(!bh) continue;
1680 do {
1681 found++;
1682 if (buffer_locked(bh))
1683 locked++;
1684 if (buffer_protected(bh))
1685 protected++;
1686 if (buffer_dirty(bh))
1687 dirty++;
1688 if (bh->b_count)
1689 used++, lastused = found;
1690 bh = bh->b_next_free;
1691 } while (bh != lru_list[nlist]);
1692 printk("%8s: %d buffers, %d used (last=%d), "
1693 "%d locked, %d protected, %d dirty\n",
1694 buf_types[nlist], found, used, lastused,
1695 locked, protected, dirty);
1700 /* ===================== Init ======================= */
1703 * Allocate the hash table and init the free list.
1704 * Use gfp() for the hash table to decrease TLB misses, and use the
1705 * SLAB cache for buffer heads.
1707 void buffer_init(void)
1709 hash_table = (struct buffer_head **)
1710 __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER);
1711 if (!hash_table)
1712 panic("Failed to allocate buffer hash table\n");
1713 memset(hash_table,0,NR_HASH*sizeof(struct buffer_head *));
1715 bh_cachep = kmem_cache_create("buffer_head",
1716 sizeof(struct buffer_head),
1718 SLAB_HWCACHE_ALIGN, NULL, NULL);
1719 if(!bh_cachep)
1720 panic("Cannot create buffer head SLAB cache\n");
1722 * Allocate the reserved buffer heads.
1724 while (nr_buffer_heads < NR_RESERVED) {
1725 struct buffer_head * bh;
1727 bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
1728 if (!bh)
1729 break;
1730 put_unused_buffer_head(bh);
1731 nr_buffer_heads++;
1734 lru_list[BUF_CLEAN] = 0;
1735 grow_buffers(GFP_KERNEL, BLOCK_SIZE);
1739 /* ====================== bdflush support =================== */
1741 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1742 * response to dirty buffers. Once this process is activated, we write back
1743 * a limited number of buffers to the disks and then go back to sleep again.
1745 static struct wait_queue * bdflush_wait = NULL;
1746 static struct wait_queue * bdflush_done = NULL;
1747 struct task_struct *bdflush_tsk = 0;
1749 void wakeup_bdflush(int wait)
1751 if (current == bdflush_tsk)
1752 return;
1753 wake_up(&bdflush_wait);
1754 if (wait) {
1755 run_task_queue(&tq_disk);
1756 sleep_on(&bdflush_done);
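/*
 * With wait != 0 the caller first kicks the disk queue and then sleeps on
 * bdflush_done, i.e. it blocks until bdflush has finished one flushing
 * pass and does wake_up(&bdflush_done) below.
 */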
1762 * Here we attempt to write back old buffers. We also try to flush inodes
1763 * and supers as well, since this function is essentially "update", and
1764 * otherwise there would be no way of ensuring that these quantities ever
1765 * get written back. Ideally, we would have a timestamp on the inodes
1766 * and superblocks so that we could write back only the old ones as well
1769 asmlinkage int sync_old_buffers(void)
1771 int i;
1772 int ndirty, nwritten;
1773 int nlist;
1774 int ncount;
1775 struct buffer_head * bh, *next;
1777 sync_supers(0);
1778 sync_inodes(0);
1780 ncount = 0;
1781 #ifdef DEBUG
1782 for(nlist = 0; nlist < NR_LIST; nlist++)
1783 #else
1784 for(nlist = BUF_DIRTY; nlist <= BUF_DIRTY; nlist++)
1785 #endif
1787 ndirty = 0;
1788 nwritten = 0;
1789 repeat:
1791 bh = lru_list[nlist];
1792 if(bh)
1793 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1794 /* We may have stalled while waiting for I/O to complete. */
1795 if(bh->b_list != nlist) goto repeat;
1796 next = bh->b_next_free;
1797 if(!lru_list[nlist]) {
1798 printk("Dirty list empty %d\n", i);
1799 break;
1802 /* Clean buffer on dirty list? Refile it */
1803 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
1805 refile_buffer(bh);
1806 continue;
1809 if (buffer_locked(bh) || !buffer_dirty(bh))
1810 continue;
1811 ndirty++;
1812 if(bh->b_flushtime > jiffies) continue;
1813 nwritten++;
1814 next->b_count++;
1815 bh->b_count++;
1816 bh->b_flushtime = 0;
1817 #ifdef DEBUG
1818 if(nlist != BUF_DIRTY) ncount++;
1819 #endif
1820 ll_rw_block(WRITE, 1, &bh);
1821 bh->b_count--;
1822 next->b_count--;
1825 run_task_queue(&tq_disk);
1826 #ifdef DEBUG
1827 if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
1828 printk("Wrote %d/%d buffers\n", nwritten, ndirty);
1829 #endif
1830 run_task_queue(&tq_disk);
1831 return 0;
1835 /* This is the interface to bdflush. As we get more sophisticated, we can
1836 * pass tuning parameters to this "process", to adjust how it behaves.
1837 * We would want to verify each parameter, however, to make sure that it
1838 * is reasonable. */
1840 asmlinkage int sys_bdflush(int func, long data)
1842 int i, error = -EPERM;
1844 lock_kernel();
1845 if (!suser())
1846 goto out;
1848 if (func == 1) {
1849 error = sync_old_buffers();
1850 goto out;
1853 /* Basically func 2n+2 reads bdf_prm.data[n] and func 2n+3 writes it. */
1854 if (func >= 2) {
1855 i = (func-2) >> 1;
1856 error = -EINVAL;
1857 if (i < 0 || i >= N_PARAM)
1858 goto out;
1859 if((func & 1) == 0) {
1860 error = put_user(bdf_prm.data[i], (int*)data);
1861 goto out;
1863 if (data < bdflush_min[i] || data > bdflush_max[i])
1864 goto out;
1865 bdf_prm.data[i] = data;
1866 error = 0;
1867 goto out;
1870 /* Func 0 used to launch the actual bdflush and then never
1871 * return (unless explicitly killed). We return zero here to
1872 * remain semi-compatible with present update(8) programs.
1874 error = 0;
1875 out:
1876 unlock_kernel();
1877 return error;
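/*
 * Illustrative (hypothetical user-space) use of the interface above,
 * assuming a bdflush(func, data) syscall wrapper:
 *
 *	int nfract;
 *	bdflush(2, (long) &nfract);	reads bdf_prm.data[0] (nfract)
 *	bdflush(3, 60);			sets nfract to 60 (within min/max)
 *	bdflush(1, 0);			runs sync_old_buffers()
 */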
1880 /* This is the actual bdflush daemon itself. It used to be started from
1881 * the syscall above, but now we launch it ourselves internally with
1882 * kernel_thread(...) directly after the first thread in init/main.c */
1884 /* To prevent deadlocks for a loop device:
1885 * 1) Do non-blocking writes to loop (avoids deadlock with running
1886 * out of request blocks).
1887 * 2) But do a blocking write if the only dirty buffers are loop buffers
1888 * (otherwise we go into an infinite busy-loop).
1889 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
1890 * with running out of free buffers for loop's "real" device).
1892 int bdflush(void * unused)
1894 int i;
1895 int ndirty;
1896 int nlist;
1897 int ncount;
1898 struct buffer_head * bh, *next;
1899 int major;
1900 int wrta_cmd = WRITEA; /* non-blocking write for LOOP */
1903 * We have a bare-bones task_struct, and really should fill
1904 * in a few more things so "top" and /proc/2/{exe,root,cwd}
1905 * display semi-sane things. Not real crucial though...
1908 current->session = 1;
1909 current->pgrp = 1;
1910 sprintf(current->comm, "kflushd");
1911 bdflush_tsk = current;
1914 * As a kernel thread we want to tamper with system buffers
1915 * and other internals and thus be subject to the SMP locking
1916 * rules. (On a uniprocessor box this does nothing).
1918 lock_kernel();
1920 for (;;) {
1921 #ifdef DEBUG
1922 printk("bdflush() activated...");
1923 #endif
1925 CHECK_EMERGENCY_SYNC
1927 ncount = 0;
1928 #ifdef DEBUG
1929 for(nlist = 0; nlist < NR_LIST; nlist++)
1930 #else
1931 for(nlist = BUF_DIRTY; nlist <= BUF_DIRTY; nlist++)
1932 #endif
1934 ndirty = 0;
1935 refilled = 0;
1936 repeat:
1938 bh = lru_list[nlist];
1939 if(bh)
1940 for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
1941 bh = next) {
1942 /* We may have stalled while waiting for I/O to complete. */
1943 if(bh->b_list != nlist) goto repeat;
1944 next = bh->b_next_free;
1945 if(!lru_list[nlist]) {
1946 printk("Dirty list empty %d\n", i);
1947 break;
1950 /* Clean buffer on dirty list? Refile it */
1951 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
1953 refile_buffer(bh);
1954 continue;
1957 if (buffer_locked(bh) || !buffer_dirty(bh))
1958 continue;
1959 major = MAJOR(bh->b_dev);
1960 /* Should we write back buffers that are shared or not??
1961 currently dirty buffers are not shared, so it does not matter */
1962 if (refilled && major == LOOP_MAJOR)
1963 continue;
1964 next->b_count++;
1965 bh->b_count++;
1966 ndirty++;
1967 bh->b_flushtime = 0;
1968 if (major == LOOP_MAJOR) {
1969 ll_rw_block(wrta_cmd,1, &bh);
1970 wrta_cmd = WRITEA;
1971 if (buffer_dirty(bh))
1972 --ndirty;
1974 else
1975 ll_rw_block(WRITE, 1, &bh);
1976 #ifdef DEBUG
1977 if(nlist != BUF_DIRTY) ncount++;
1978 #endif
1979 bh->b_count--;
1980 next->b_count--;
1983 #ifdef DEBUG
1984 if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
1985 printk("sleeping again.\n");
1986 #endif
1987 /* If we didn't write anything, but there are still
1988 * dirty buffers, then make the next write to a
1989 * loop device a blocking write.
1990 * This lets us block--which we _must_ do! */
1991 if (ndirty == 0 && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
1992 wrta_cmd = WRITE;
1993 continue;
1995 run_task_queue(&tq_disk);
1996 wake_up(&bdflush_done);
1998 /* If there are still a lot of dirty buffers around, skip the sleep
1999 and flush some more */
2000 if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
2001 spin_lock_irq(&current->sigmask_lock);
2002 flush_signals(current);
2003 spin_unlock_irq(&current->sigmask_lock);
2005 interruptible_sleep_on(&bdflush_wait);