/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */

/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */

/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */

/* Added 32k buffer block sizes - these are required on older ARM systems.
 * - RMK
 */
27 #include <linux/sched.h>
28 #include <linux/kernel.h>
29 #include <linux/major.h>
30 #include <linux/string.h>
31 #include <linux/locks.h>
32 #include <linux/errno.h>
33 #include <linux/malloc.h>
34 #include <linux/slab.h>
35 #include <linux/pagemap.h>
36 #include <linux/swap.h>
37 #include <linux/swapctl.h>
38 #include <linux/smp.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
47 #include <asm/system.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
52 #define NR_SIZES 7
53 static char buffersize_index[65] =
54 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
55 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
56 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 6};
60 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
61 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
62 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
63 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
64 number of unused buffer heads */
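
/*
 * Worked example: the buffersize_index[] table maps (size >> 9) to a small
 * index, so BUFSIZE_INDEX(512) == 0, BUFSIZE_INDEX(1024) == 1,
 * BUFSIZE_INDEX(2048) == 2, BUFSIZE_INDEX(4096) == 3, and so on up to
 * BUFSIZE_INDEX(32768) == 6 == NR_SIZES - 1.  Any size that is not a power
 * of two between 512 bytes and 32k lands on a -1 entry.
 */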
/*
 * Hash table mask..
 */
69 static unsigned long bh_hash_mask = 0;
71 static int grow_buffers(int pri, int size);
73 static struct buffer_head ** hash_table;
74 static struct buffer_head * lru_list[NR_LIST] = {NULL, };
75 static struct buffer_head * free_list[NR_SIZES] = {NULL, };
77 static kmem_cache_t *bh_cachep;
79 static struct buffer_head * unused_list = NULL;
80 static struct buffer_head * reuse_list = NULL;
81 static struct wait_queue * buffer_wait = NULL;
83 static int nr_buffers = 0;
84 static int nr_buffers_type[NR_LIST] = {0,};
85 static int nr_buffer_heads = 0;
86 static int nr_unused_buffer_heads = 0;
static int refilled = 0;	/* Set NZ when a buffer freelist is refilled;
				   this is used by the loop device */
90 /* This is used by some architectures to estimate available memory. */
91 int buffermem = 0;
/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
102 union bdflush_param{
103 struct {
104 int nfract; /* Percentage of buffer cache dirty to
105 activate bdflush */
106 int ndirty; /* Maximum number of dirty blocks to write out per
107 wake-cycle */
108 int nrefill; /* Number of clean buffers to try to obtain
109 each time we call refill */
110 int nref_dirt; /* Dirty buffer threshold for activating bdflush
111 when trying to refill buffers. */
112 int dummy1; /* unused */
113 int age_buffer; /* Time for normal buffer to age before
114 we flush it */
115 int age_super; /* Time for superblock to age before we
116 flush it */
117 int dummy2; /* unused */
118 int dummy3; /* unused */
119 } b_un;
120 unsigned int data[N_PARAM];
121 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
123 /* These are the min and max parameter values that we will allow to be assigned */
124 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 100, 100, 1, 1};
125 int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 60000, 60000, 2047, 5};
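
/*
 * Illustration of the defaults above: nfract == 40 means bdflush gets woken
 * once more than 40% of nr_buffers are dirty (see refile_buffer() below),
 * ndirty == 500 caps how many dirty blocks a single wake-up cycle writes out,
 * and age_buffer == 30*HZ / age_super == 5*HZ are the flush ages, in jiffies,
 * applied by set_writetime().
 */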
127 void wakeup_bdflush(int);
/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and getting rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * if 'b_wait' is set before calling this, so that the queues aren't set
 * up unnecessarily.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	struct task_struct *tsk = current;
	struct wait_queue wait;

	bh->b_count++;
	wait.task = tsk;
	add_wait_queue(&bh->b_wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (buffer_locked(bh)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
	bh->b_count--;
}
/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed. Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn. Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
168 static int sync_buffers(kdev_t dev, int wait)
170 int i, retry, pass = 0, err = 0;
171 struct buffer_head * bh, *next;
	/* One pass for no-wait, three for wait:
	 * 0) write out all dirty, unlocked buffers;
	 * 1) write out all dirty buffers, waiting if locked;
	 * 2) wait for completion by waiting for all buffers to unlock.
	 */
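
	/* In other words: sync_buffers(dev, 0) makes a single non-blocking
	 * pass, while sync_buffers(dev, 1) can loop up to pass 2, as the
	 * "while (wait && retry && ++pass<=2)" at the bottom shows.
	 */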
178 do {
179 retry = 0;
180 repeat:
181 /* We search all lists as a failsafe mechanism, not because we expect
182 * there to be dirty buffers on any of the other lists.
184 bh = lru_list[BUF_DIRTY];
185 if (!bh)
186 goto repeat2;
187 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
188 if (bh->b_list != BUF_DIRTY)
189 goto repeat;
190 next = bh->b_next_free;
191 if (!lru_list[BUF_DIRTY])
192 break;
193 if (dev && bh->b_dev != dev)
194 continue;
195 if (buffer_locked(bh)) {
196 /* Buffer is locked; skip it unless wait is
197 * requested AND pass > 0.
199 if (!wait || !pass) {
200 retry = 1;
201 continue;
203 wait_on_buffer (bh);
204 goto repeat;
207 /* If an unlocked buffer is not uptodate, there has
208 * been an IO error. Skip it.
210 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
211 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
212 err = -EIO;
213 continue;
216 /* Don't write clean buffers. Don't write ANY buffers
217 * on the third pass.
219 if (!buffer_dirty(bh) || pass >= 2)
220 continue;
222 /* Don't bother about locked buffers.
224 * XXX We checked if it was locked above and there is no
225 * XXX way we could have slept in between. -DaveM
227 if (buffer_locked(bh))
228 continue;
229 bh->b_count++;
230 next->b_count++;
231 bh->b_flushtime = 0;
232 ll_rw_block(WRITE, 1, &bh);
233 bh->b_count--;
234 next->b_count--;
235 retry = 1;
238 repeat2:
239 bh = lru_list[BUF_LOCKED];
240 if (!bh)
241 break;
242 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
243 if (bh->b_list != BUF_LOCKED)
244 goto repeat2;
245 next = bh->b_next_free;
246 if (!lru_list[BUF_LOCKED])
247 break;
248 if (dev && bh->b_dev != dev)
249 continue;
250 if (buffer_locked(bh)) {
251 /* Buffer is locked; skip it unless wait is
252 * requested AND pass > 0.
254 if (!wait || !pass) {
255 retry = 1;
256 continue;
258 wait_on_buffer (bh);
259 goto repeat2;
263 /* If we are waiting for the sync to succeed, and if any dirty
264 * blocks were written, then repeat; on the second pass, only
265 * wait for buffers being written (do not pass to write any
266 * more buffers on the second pass).
268 } while (wait && retry && ++pass<=2);
269 return err;
void sync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	sync_buffers(dev, 0);
	DQUOT_SYNC(dev);
	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */
}

int fsync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	DQUOT_SYNC(dev);
	return sync_buffers(dev, 1);
}

asmlinkage int sys_sync(void)
{
	lock_kernel();
	fsync_dev(0);
	unlock_kernel();
	return 0;
}

/*
 * filp may be NULL if called via the msync of a vma.
 */
313 int file_fsync(struct file *filp, struct dentry *dentry)
315 struct inode * inode = dentry->d_inode;
316 struct super_block * sb;
317 kdev_t dev;
319 /* sync the inode to buffers */
320 write_inode_now(inode);
322 /* sync the superblock to buffers */
323 sb = inode->i_sb;
324 wait_on_super(sb);
325 if (sb->s_op && sb->s_op->write_super)
326 sb->s_op->write_super(sb);
328 /* .. finally sync the buffers to disk */
329 dev = inode->i_dev;
330 return sync_buffers(dev, 1);
333 asmlinkage int sys_fsync(unsigned int fd)
335 struct file * file;
336 struct dentry * dentry;
337 struct inode * inode;
338 int err;
340 lock_kernel();
341 err = -EBADF;
342 file = fget(fd);
343 if (!file)
344 goto out;
346 dentry = file->f_dentry;
347 if (!dentry)
348 goto out_putf;
350 inode = dentry->d_inode;
351 if (!inode)
352 goto out_putf;
354 err = -EINVAL;
355 if (!file->f_op || !file->f_op->fsync)
356 goto out_putf;
358 /* We need to protect against concurrent writers.. */
359 down(&inode->i_sem);
360 err = file->f_op->fsync(file, dentry);
361 up(&inode->i_sem);
363 out_putf:
364 fput(file);
365 out:
366 unlock_kernel();
367 return err;
370 asmlinkage int sys_fdatasync(unsigned int fd)
372 struct file * file;
373 struct dentry * dentry;
374 struct inode * inode;
375 int err;
377 lock_kernel();
378 err = -EBADF;
379 file = fget(fd);
380 if (!file)
381 goto out;
383 dentry = file->f_dentry;
384 if (!dentry)
385 goto out_putf;
387 inode = dentry->d_inode;
388 if (!inode)
389 goto out_putf;
391 err = -EINVAL;
392 if (!file->f_op || !file->f_op->fsync)
393 goto out_putf;
395 /* this needs further work, at the moment it is identical to fsync() */
396 err = file->f_op->fsync(file, dentry);
398 out_putf:
399 fput(file);
400 out:
401 unlock_kernel();
402 return err;
405 void invalidate_buffers(kdev_t dev)
407 int i;
408 int nlist;
409 struct buffer_head * bh;
411 for(nlist = 0; nlist < NR_LIST; nlist++) {
412 bh = lru_list[nlist];
413 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
414 if (bh->b_dev != dev)
415 continue;
416 wait_on_buffer(bh);
417 if (bh->b_dev != dev)
418 continue;
419 if (bh->b_count)
420 continue;
421 bh->b_flushtime = 0;
422 clear_bit(BH_Protected, &bh->b_state);
423 clear_bit(BH_Uptodate, &bh->b_state);
424 clear_bit(BH_Dirty, &bh->b_state);
425 clear_bit(BH_Req, &bh->b_state);
430 #define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
431 #define hash(dev,block) hash_table[_hashfn(dev,block)]
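
/*
 * Illustration: bh_hash_mask is nr_hash - 1 with nr_hash a power of two
 * (see buffer_init() below), so the "& bh_hash_mask" in _hashfn() reduces
 * HASHDEV(dev) ^ block to a bucket index without a divide.  For example,
 * with bh_hash_mask == 0x7fff a value of 0x12345 hashes to bucket 0x2345.
 */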
433 static inline void remove_from_hash_queue(struct buffer_head * bh)
435 if (bh->b_pprev) {
436 if(bh->b_next)
437 bh->b_next->b_pprev = bh->b_pprev;
438 *bh->b_pprev = bh->b_next;
439 bh->b_pprev = NULL;
443 static inline void remove_from_lru_list(struct buffer_head * bh)
445 if (!(bh->b_prev_free) || !(bh->b_next_free))
446 panic("VFS: LRU block list corrupted");
447 if (bh->b_dev == B_FREE)
448 panic("LRU list corrupted");
449 bh->b_prev_free->b_next_free = bh->b_next_free;
450 bh->b_next_free->b_prev_free = bh->b_prev_free;
452 if (lru_list[bh->b_list] == bh)
453 lru_list[bh->b_list] = bh->b_next_free;
454 if (lru_list[bh->b_list] == bh)
455 lru_list[bh->b_list] = NULL;
456 bh->b_next_free = bh->b_prev_free = NULL;
459 static inline void remove_from_free_list(struct buffer_head * bh)
461 int isize = BUFSIZE_INDEX(bh->b_size);
462 if (!(bh->b_prev_free) || !(bh->b_next_free))
463 panic("VFS: Free block list corrupted");
464 if(bh->b_dev != B_FREE)
465 panic("Free list corrupted");
466 if(!free_list[isize])
467 panic("Free list empty");
468 if(bh->b_next_free == bh)
469 free_list[isize] = NULL;
470 else {
471 bh->b_prev_free->b_next_free = bh->b_next_free;
472 bh->b_next_free->b_prev_free = bh->b_prev_free;
473 if (free_list[isize] == bh)
474 free_list[isize] = bh->b_next_free;
476 bh->b_next_free = bh->b_prev_free = NULL;
479 static inline void remove_from_queues(struct buffer_head * bh)
481 if(bh->b_dev == B_FREE) {
482 remove_from_free_list(bh); /* Free list entries should not be
483 in the hash queue */
484 return;
486 nr_buffers_type[bh->b_list]--;
487 remove_from_hash_queue(bh);
488 remove_from_lru_list(bh);
491 static inline void put_last_lru(struct buffer_head * bh)
493 if (bh) {
494 struct buffer_head **bhp = &lru_list[bh->b_list];
496 if (bh == *bhp) {
497 *bhp = bh->b_next_free;
498 return;
501 if(bh->b_dev == B_FREE)
502 panic("Wrong block for lru list");
504 /* Add to back of free list. */
505 remove_from_lru_list(bh);
506 if(!*bhp) {
507 *bhp = bh;
508 (*bhp)->b_prev_free = bh;
511 bh->b_next_free = *bhp;
512 bh->b_prev_free = (*bhp)->b_prev_free;
513 (*bhp)->b_prev_free->b_next_free = bh;
514 (*bhp)->b_prev_free = bh;
518 static inline void put_last_free(struct buffer_head * bh)
520 if (bh) {
521 struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
523 bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */
525 /* Add to back of free list. */
526 if(!*bhp) {
527 *bhp = bh;
528 bh->b_prev_free = bh;
531 bh->b_next_free = *bhp;
532 bh->b_prev_free = (*bhp)->b_prev_free;
533 (*bhp)->b_prev_free->b_next_free = bh;
534 (*bhp)->b_prev_free = bh;
538 static inline void insert_into_queues(struct buffer_head * bh)
540 /* put at end of free list */
541 if(bh->b_dev == B_FREE) {
542 put_last_free(bh);
543 } else {
544 struct buffer_head **bhp = &lru_list[bh->b_list];
546 if(!*bhp) {
547 *bhp = bh;
548 bh->b_prev_free = bh;
551 if (bh->b_next_free)
552 panic("VFS: buffer LRU pointers corrupted");
554 bh->b_next_free = *bhp;
555 bh->b_prev_free = (*bhp)->b_prev_free;
556 (*bhp)->b_prev_free->b_next_free = bh;
557 (*bhp)->b_prev_free = bh;
559 nr_buffers_type[bh->b_list]++;
561 /* Put the buffer in new hash-queue if it has a device. */
562 if (bh->b_dev) {
563 struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
564 if((bh->b_next = *bhp) != NULL)
565 (*bhp)->b_pprev = &bh->b_next;
566 *bhp = bh;
567 bh->b_pprev = bhp; /* Exists in bh hashes. */
568 } else
569 bh->b_pprev = NULL; /* Not in bh hashes. */
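
	/* Note on the hash linkage above: b_pprev points at whichever pointer
	 * currently references this buffer (the hash_table[] slot or the
	 * previous buffer's b_next), which is why remove_from_hash_queue()
	 * can unlink a buffer without walking or re-hashing the chain.
	 */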
573 struct buffer_head * find_buffer(kdev_t dev, int block, int size)
575 struct buffer_head * next;
577 next = hash(dev,block);
578 for (;;) {
579 struct buffer_head *tmp = next;
580 if (!next)
581 break;
582 next = tmp->b_next;
583 if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
584 continue;
585 next = tmp;
586 break;
588 return next;
592 * Why like this, I hear you say... The reason is race-conditions.
593 * As we don't lock buffers (unless we are reading them, that is),
594 * something might happen to it while we sleep (ie a read-error
595 * will force it bad). This shouldn't really happen currently, but
596 * the code is ready.
598 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
600 struct buffer_head * bh;
601 for (;;) {
602 bh = find_buffer(dev,block,size);
603 if (!bh)
604 break;
605 bh->b_count++;
606 bh->b_lru_time = jiffies;
607 if (!buffer_locked(bh))
608 break;
609 __wait_on_buffer(bh);
610 if (bh->b_dev == dev &&
611 bh->b_blocknr == block &&
612 bh->b_size == size)
613 break;
614 bh->b_count--;
616 return bh;
619 unsigned int get_hardblocksize(kdev_t dev)
622 * Get the hard sector size for the given device. If we don't know
623 * what it is, return 0.
625 if (hardsect_size[MAJOR(dev)] != NULL) {
626 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
627 if (blksize != 0)
628 return blksize;
632 * We don't know what the hardware sector size for this device is.
633 * Return 0 indicating that we don't know.
635 return 0;
638 void set_blocksize(kdev_t dev, int size)
640 extern int *blksize_size[];
641 int i, nlist;
642 struct buffer_head * bh, *bhnext;
644 if (!blksize_size[MAJOR(dev)])
645 return;
647 /* Size must be a power of two, and between 512 and PAGE_SIZE */
648 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
649 panic("Invalid blocksize passed to set_blocksize");
651 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
652 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
653 return;
655 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
656 return;
657 sync_buffers(dev, 2);
658 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
660 /* We need to be quite careful how we do this - we are moving entries
661 * around on the free list, and we can get in a loop if we are not careful.
663 for(nlist = 0; nlist < NR_LIST; nlist++) {
664 bh = lru_list[nlist];
665 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
666 if(!bh)
667 break;
669 bhnext = bh->b_next_free;
670 if (bh->b_dev != dev)
671 continue;
672 if (bh->b_size == size)
673 continue;
674 bhnext->b_count++;
675 wait_on_buffer(bh);
676 bhnext->b_count--;
677 if (bh->b_dev == dev && bh->b_size != size) {
678 clear_bit(BH_Dirty, &bh->b_state);
679 clear_bit(BH_Uptodate, &bh->b_state);
680 clear_bit(BH_Req, &bh->b_state);
681 bh->b_flushtime = 0;
683 remove_from_hash_queue(bh);
689 * Find a candidate buffer to be reclaimed.
690 * N.B. Must search the entire BUF_LOCKED list rather than terminating
691 * when the first locked buffer is found. Buffers are unlocked at
692 * completion of IO, and under some conditions there may be (many)
693 * unlocked buffers after the first locked one.
695 static struct buffer_head *find_candidate(struct buffer_head *bh,
696 int *list_len, int size)
698 if (!bh)
699 goto no_candidate;
701 for (; (*list_len) > 0; bh = bh->b_next_free, (*list_len)--) {
702 if (size != bh->b_size) {
			/* This provides a mechanism for freeing blocks
			 * of other sizes; this is necessary now that we
			 * no longer have the lav code.
			 */
707 try_to_free_buffer(bh,&bh,1);
708 if (!bh)
709 break;
710 continue;
712 else if (!bh->b_count &&
713 !buffer_locked(bh) &&
714 !buffer_protected(bh) &&
715 !buffer_dirty(bh))
716 return bh;
719 no_candidate:
720 return NULL;
723 static void refill_freelist(int size)
725 struct buffer_head * bh, * next;
726 struct buffer_head * candidate[BUF_DIRTY];
727 int buffers[BUF_DIRTY];
728 int i;
729 int needed, obtained=0;
731 refilled = 1;
733 /* We are going to try to locate this much memory. */
734 needed = bdf_prm.b_un.nrefill * size;
736 while ((nr_free_pages > freepages.min*2) &&
737 (buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.max_percent * num_physpages) &&
738 grow_buffers(GFP_BUFFER, size)) {
739 obtained += PAGE_SIZE;
740 if (obtained >= needed)
741 return;
745 * Update the needed amount based on the number of potentially
746 * freeable buffers. We don't want to free more than one quarter
747 * of the available buffers.
749 i = (nr_buffers_type[BUF_CLEAN] + nr_buffers_type[BUF_LOCKED]) >> 2;
750 if (i < bdf_prm.b_un.nrefill) {
751 needed = i * size;
752 if (needed < PAGE_SIZE)
753 needed = PAGE_SIZE;
757 * OK, we cannot grow the buffer cache, now try to get some
758 * from the lru list.
760 repeat:
761 if (obtained >= needed)
762 return;
765 * First set the candidate pointers to usable buffers. This
766 * should be quick nearly all of the time. N.B. There must be
767 * no blocking calls after setting up the candidate[] array!
769 for (i = BUF_CLEAN; i<BUF_DIRTY; i++) {
770 buffers[i] = nr_buffers_type[i];
771 candidate[i] = find_candidate(lru_list[i], &buffers[i], size);
775 * Select the older of the available buffers until we reach our goal.
777 for (;;) {
778 i = BUF_CLEAN;
779 if (!candidate[BUF_CLEAN]) {
780 if (!candidate[BUF_LOCKED])
781 break;
782 i = BUF_LOCKED;
784 else if (candidate[BUF_LOCKED] &&
785 (candidate[BUF_LOCKED]->b_lru_time <
786 candidate[BUF_CLEAN ]->b_lru_time))
787 i = BUF_LOCKED;
789 * Free the selected buffer and get the next candidate.
791 bh = candidate[i];
792 next = bh->b_next_free;
794 obtained += bh->b_size;
795 remove_from_queues(bh);
796 put_last_free(bh);
797 if (obtained >= needed)
798 return;
800 if (--buffers[i] && bh != next)
801 candidate[i] = find_candidate(next, &buffers[i], size);
802 else
803 candidate[i] = NULL;
807 * If there are dirty buffers, do a non-blocking wake-up.
808 * This increases the chances of having buffers available
809 * for the next call ...
811 if (nr_buffers_type[BUF_DIRTY])
812 wakeup_bdflush(0);
815 * Allocate buffers to reach half our goal, if possible.
816 * Since the allocation doesn't block, there's no reason
817 * to search the buffer lists again. Then return if there
818 * are _any_ free buffers.
820 while (obtained < (needed >> 1) &&
821 nr_free_pages > freepages.min + 5 &&
822 grow_buffers(GFP_BUFFER, size))
823 obtained += PAGE_SIZE;
825 if (free_list[BUFSIZE_INDEX(size)])
826 return;
829 * If there are dirty buffers, wait while bdflush writes
830 * them out. The buffers become locked, but we can just
831 * wait for one to unlock ...
833 if (nr_buffers_type[BUF_DIRTY])
834 wakeup_bdflush(1);
837 * In order to prevent a buffer shortage from exhausting
838 * the system's reserved pages, we force tasks to wait
839 * before using reserved pages for buffers. This is easily
840 * accomplished by waiting on an unused locked buffer.
842 if ((bh = lru_list[BUF_LOCKED]) != NULL) {
843 for (i = nr_buffers_type[BUF_LOCKED]; i--; bh = bh->b_next_free)
845 if (bh->b_size != size)
846 continue;
847 if (bh->b_count)
848 continue;
849 if (!buffer_locked(bh))
850 continue;
851 if (buffer_dirty(bh) || buffer_protected(bh))
852 continue;
853 if (MAJOR(bh->b_dev) == LOOP_MAJOR)
854 continue;
856 * We've found an unused, locked, non-dirty buffer of
857 * the correct size. Claim it so no one else can,
858 * then wait for it to unlock.
860 bh->b_count++;
861 wait_on_buffer(bh);
862 bh->b_count--;
864 * Loop back to harvest this (and maybe other) buffers.
866 goto repeat;
871 * Convert a reserved page into buffers ... should happen only rarely.
873 if (grow_buffers(GFP_ATOMIC, size)) {
874 #ifdef BUFFER_DEBUG
875 printk("refill_freelist: used reserve page\n");
876 #endif
877 return;
881 * System is _very_ low on memory ... sleep and try later.
883 #ifdef BUFFER_DEBUG
884 printk("refill_freelist: task %s waiting for buffers\n", current->comm);
885 #endif
886 schedule();
887 goto repeat;
890 void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
891 bh_end_io_t *handler, void *dev_id)
893 bh->b_count = 1;
894 bh->b_list = BUF_CLEAN;
895 bh->b_flushtime = 0;
896 bh->b_dev = dev;
897 bh->b_blocknr = block;
898 bh->b_end_io = handler;
899 bh->b_dev_id = dev_id;
static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}
909 * Ok, this is getblk, and it isn't very clear, again to hinder
910 * race-conditions. Most of the code is seldom used, (ie repeating),
911 * so it should be much more efficient than it looks.
913 * The algorithm is changed: hopefully better, and an elusive bug removed.
915 * 14.02.92: changed it to sync dirty buffers a bit: better performance
916 * when the filesystem starts to get full of dirty blocks (I hope).
918 struct buffer_head * getblk(kdev_t dev, int block, int size)
920 struct buffer_head * bh;
921 int isize;
923 repeat:
924 bh = get_hash_table(dev, block, size);
925 if (bh) {
926 if (!buffer_dirty(bh)) {
927 if (buffer_uptodate(bh))
928 put_last_lru(bh);
929 bh->b_flushtime = 0;
931 set_bit(BH_Touched, &bh->b_state);
932 return bh;
935 isize = BUFSIZE_INDEX(size);
936 get_free:
937 bh = free_list[isize];
938 if (!bh)
939 goto refill;
940 remove_from_free_list(bh);
942 /* OK, FINALLY we know that this buffer is the only one of its kind,
943 * and that it's unused (b_count=0), unlocked, and clean.
945 init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
946 bh->b_lru_time = jiffies;
947 bh->b_state=(1<<BH_Touched);
948 insert_into_queues(bh);
949 return bh;
952 * If we block while refilling the free list, somebody may
953 * create the buffer first ... search the hashes again.
955 refill:
956 refill_freelist(size);
957 if (!find_buffer(dev,block,size))
958 goto get_free;
959 goto repeat;
962 void set_writetime(struct buffer_head * buf, int flag)
964 int newtime;
966 if (buffer_dirty(buf)) {
967 /* Move buffer to dirty list if jiffies is clear. */
968 newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
969 bdf_prm.b_un.age_buffer);
970 if(!buf->b_flushtime || buf->b_flushtime > newtime)
971 buf->b_flushtime = newtime;
972 } else {
973 buf->b_flushtime = 0;
979 * Put a buffer into the appropriate list, without side-effects.
981 static inline void file_buffer(struct buffer_head *bh, int list)
983 remove_from_queues(bh);
984 bh->b_list = list;
985 insert_into_queues(bh);
989 * A buffer may need to be moved from one buffer list to another
990 * (e.g. in case it is not shared any more). Handle this.
992 void refile_buffer(struct buffer_head * buf)
994 int dispose;
996 if(buf->b_dev == B_FREE) {
997 printk("Attempt to refile free buffer\n");
998 return;
1000 if (buffer_dirty(buf))
1001 dispose = BUF_DIRTY;
1002 else if (buffer_locked(buf))
1003 dispose = BUF_LOCKED;
1004 else
1005 dispose = BUF_CLEAN;
1006 if(dispose != buf->b_list) {
1007 file_buffer(buf, dispose);
1008 if(dispose == BUF_DIRTY) {
1009 int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
1011 /* This buffer is dirty, maybe we need to start flushing.
1012 * If too high a percentage of the buffers are dirty...
1014 if (nr_buffers_type[BUF_DIRTY] > too_many)
1015 wakeup_bdflush(0);
1017 /* If this is a loop device, and
1018 * more than half of the buffers are dirty...
1019 * (Prevents no-free-buffers deadlock with loop device.)
1021 if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
1022 nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
1023 wakeup_bdflush(1);
1029 * Release a buffer head
1031 void __brelse(struct buffer_head * buf)
1033 wait_on_buffer(buf);
1035 /* If dirty, mark the time this buffer should be written back. */
1036 set_writetime(buf, 0);
1037 refile_buffer(buf);
1039 if (buf->b_count) {
1040 buf->b_count--;
1041 return;
1043 printk("VFS: brelse: Trying to free free buffer\n");
1047 * bforget() is like brelse(), except it removes the buffer
1048 * from the hash-queues (so that it won't be re-used if it's
1049 * shared).
1051 void __bforget(struct buffer_head * buf)
1053 wait_on_buffer(buf);
1054 mark_buffer_clean(buf);
1055 clear_bit(BH_Protected, &buf->b_state);
1056 remove_from_hash_queue(buf);
1057 buf->b_dev = NODEV;
1058 refile_buffer(buf);
1059 if (!--buf->b_count)
1060 return;
1061 printk("VFS: forgot an in-use buffer! (count=%d)\n",
1062 buf->b_count);
1066 * bread() reads a specified block and returns the buffer that contains
1067 * it. It returns NULL if the block was unreadable.
1069 struct buffer_head * bread(kdev_t dev, int block, int size)
1071 struct buffer_head * bh = getblk(dev, block, size);
1073 if (bh) {
1074 if (buffer_uptodate(bh))
1075 return bh;
1076 ll_rw_block(READ, 1, &bh);
1077 wait_on_buffer(bh);
1078 if (buffer_uptodate(bh))
1079 return bh;
1080 brelse(bh);
1081 return NULL;
1083 printk("VFS: bread: impossible error\n");
1084 return NULL;
/*
 * Ok, breada can be used as bread, but additionally marks other
 * blocks for reading as well. End the argument list with a negative
 * number.
 */
1093 #define NBUF 16
1095 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1096 unsigned int pos, unsigned int filesize)
1098 struct buffer_head * bhlist[NBUF];
1099 unsigned int blocks;
1100 struct buffer_head * bh;
1101 int index;
1102 int i, j;
1104 if (pos >= filesize)
1105 return NULL;
1107 if (block < 0 || !(bh = getblk(dev,block,bufsize)))
1108 return NULL;
1110 index = BUFSIZE_INDEX(bh->b_size);
1112 if (buffer_uptodate(bh))
1113 return(bh);
1114 else ll_rw_block(READ, 1, &bh);
1116 blocks = (filesize - pos) >> (9+index);
1118 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1119 blocks = read_ahead[MAJOR(dev)] >> index;
1120 if (blocks > NBUF)
1121 blocks = NBUF;
1123 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1126 bhlist[0] = bh;
1127 j = 1;
1128 for(i=1; i<blocks; i++) {
1129 bh = getblk(dev,block+i,bufsize);
1130 if (buffer_uptodate(bh)) {
1131 brelse(bh);
1132 break;
1134 else bhlist[j++] = bh;
1137 /* Request the read for these buffers, and then release them. */
1138 if (j>1)
1139 ll_rw_block(READA, (j-1), bhlist+1);
1140 for(i=1; i<j; i++)
1141 brelse(bhlist[i]);
1143 /* Wait for this buffer, and then continue on. */
1144 bh = bhlist[0];
1145 wait_on_buffer(bh);
1146 if (buffer_uptodate(bh))
1147 return bh;
1148 brelse(bh);
1149 return NULL;
1153 * Note: the caller should wake up the buffer_wait list if needed.
1155 static void put_unused_buffer_head(struct buffer_head * bh)
1157 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1158 nr_buffer_heads--;
1159 kmem_cache_free(bh_cachep, bh);
1160 return;
1163 memset(bh,0,sizeof(*bh));
1164 nr_unused_buffer_heads++;
1165 bh->b_next_free = unused_list;
1166 unused_list = bh;
1170 * We can't put completed temporary IO buffer_heads directly onto the
1171 * unused_list when they become unlocked, since the device driver
1172 * end_request routines still expect access to the buffer_head's
1173 * fields after the final unlock. So, the device driver puts them on
1174 * the reuse_list instead once IO completes, and we recover these to
1175 * the unused_list here.
1177 * Note that we don't do a wakeup here, but return a flag indicating
1178 * whether we got any buffer heads. A task ready to sleep can check
1179 * the returned value, and any tasks already sleeping will have been
1180 * awakened when the buffer heads were added to the reuse list.
1182 static inline int recover_reusable_buffer_heads(void)
1184 struct buffer_head *head = xchg(&reuse_list, NULL);
1185 int found = 0;
1187 if (head) {
1188 do {
1189 struct buffer_head *bh = head;
1190 head = head->b_next_free;
1191 put_unused_buffer_head(bh);
1192 } while (head);
1193 found = 1;
1195 return found;
1199 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1200 * no-buffer-head deadlock. Return NULL on failure; waiting for
1201 * buffer heads is now handled in create_buffers().
1203 static struct buffer_head * get_unused_buffer_head(int async)
1205 struct buffer_head * bh;
1207 recover_reusable_buffer_heads();
1208 if (nr_unused_buffer_heads > NR_RESERVED) {
1209 bh = unused_list;
1210 unused_list = bh->b_next_free;
1211 nr_unused_buffer_heads--;
1212 return bh;
1215 /* This is critical. We can't swap out pages to get
1216 * more buffer heads, because the swap-out may need
1217 * more buffer-heads itself. Thus SLAB_ATOMIC.
1219 if((bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC)) != NULL) {
1220 memset(bh, 0, sizeof(*bh));
1221 nr_buffer_heads++;
1222 return bh;
1226 * If we need an async buffer, use the reserved buffer heads.
1228 if (async && unused_list) {
1229 bh = unused_list;
1230 unused_list = bh->b_next_free;
1231 nr_unused_buffer_heads--;
1232 return bh;
1235 #if 0
1237 * (Pending further analysis ...)
1238 * Ordinary (non-async) requests can use a different memory priority
1239 * to free up pages. Any swapping thus generated will use async
1240 * buffer heads.
1242 if(!async &&
1243 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1244 memset(bh, 0, sizeof(*bh));
1245 nr_buffer_heads++;
1246 return bh;
1248 #endif
1250 return NULL;
1254 * Create the appropriate buffers when given a page for data area and
1255 * the size of each buffer.. Use the bh->b_this_page linked list to
1256 * follow the buffers created. Return NULL if unable to create more
1257 * buffers.
1258 * The async flag is used to differentiate async IO (paging, swapping)
1259 * from ordinary buffer allocations, and only async requests are allowed
1260 * to sleep waiting for buffer heads.
1262 static struct buffer_head * create_buffers(unsigned long page,
1263 unsigned long size, int async)
1265 struct wait_queue wait = { current, NULL };
1266 struct buffer_head *bh, *head;
1267 long offset;
1269 try_again:
1270 head = NULL;
1271 offset = PAGE_SIZE;
1272 while ((offset -= size) >= 0) {
1273 bh = get_unused_buffer_head(async);
1274 if (!bh)
1275 goto no_grow;
1277 bh->b_dev = B_FREE; /* Flag as unused */
1278 bh->b_this_page = head;
1279 head = bh;
1281 bh->b_state = 0;
1282 bh->b_next_free = NULL;
1283 bh->b_count = 0;
1284 bh->b_size = size;
1286 bh->b_data = (char *) (page+offset);
1287 bh->b_list = 0;
1289 return head;
1291 * In case anything failed, we just free everything we got.
1293 no_grow:
1294 if (head) {
1295 do {
1296 bh = head;
1297 head = head->b_this_page;
1298 put_unused_buffer_head(bh);
1299 } while (head);
1301 /* Wake up any waiters ... */
1302 wake_up(&buffer_wait);
1306 * Return failure for non-async IO requests. Async IO requests
1307 * are not allowed to fail, so we have to wait until buffer heads
1308 * become available. But we don't want tasks sleeping with
1309 * partially complete buffers, so all were released above.
1311 if (!async)
1312 return NULL;
1314 /* We're _really_ low on memory. Now we just
1315 * wait for old buffer heads to become free due to
1316 * finishing IO. Since this is an async request and
1317 * the reserve list is empty, we're sure there are
1318 * async buffer heads in use.
1320 run_task_queue(&tq_disk);
1323 * Set our state for sleeping, then check again for buffer heads.
1324 * This ensures we won't miss a wake_up from an interrupt.
1326 add_wait_queue(&buffer_wait, &wait);
1327 current->state = TASK_UNINTERRUPTIBLE;
1328 if (!recover_reusable_buffer_heads())
1329 schedule();
1330 remove_wait_queue(&buffer_wait, &wait);
1331 current->state = TASK_RUNNING;
1332 goto try_again;
1335 /* Run the hooks that have to be done when a page I/O has completed. */
1336 static inline void after_unlock_page (struct page * page)
1338 if (test_and_clear_bit(PG_decr_after, &page->flags)) {
1339 atomic_dec(&nr_async_pages);
1340 #ifdef DEBUG_SWAP
1341 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1342 (char *) page_address(page),
1343 atomic_read(&nr_async_pages));
1344 #endif
1346 if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
1347 swap_after_unlock_page(page->offset);
1348 if (test_and_clear_bit(PG_free_after, &page->flags))
1349 __free_page(page);
1353 * Free all temporary buffers belonging to a page.
1354 * This needs to be called with interrupts disabled.
1356 static inline void free_async_buffers (struct buffer_head * bh)
1358 struct buffer_head *tmp, *tail;
1361 * Link all the buffers into the b_next_free list,
1362 * so we only have to do one xchg() operation ...
1364 tail = bh;
1365 while ((tmp = tail->b_this_page) != bh) {
1366 tail->b_next_free = tmp;
1367 tail = tmp;
1370 /* Update the reuse list */
1371 tail->b_next_free = xchg(&reuse_list, NULL);
1372 reuse_list = bh;
1374 /* Wake up any waiters ... */
1375 wake_up(&buffer_wait);
1378 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
1380 unsigned long flags;
1381 struct buffer_head *tmp;
1382 struct page *page;
1384 mark_buffer_uptodate(bh, uptodate);
1385 unlock_buffer(bh);
1387 /* This is a temporary buffer used for page I/O. */
1388 page = mem_map + MAP_NR(bh->b_data);
1389 if (!PageLocked(page))
1390 goto not_locked;
1391 if (bh->b_count != 1)
1392 goto bad_count;
1394 if (!test_bit(BH_Uptodate, &bh->b_state))
1395 set_bit(PG_error, &page->flags);
1398 * Be _very_ careful from here on. Bad things can happen if
1399 * two buffer heads end IO at almost the same time and both
1400 * decide that the page is now completely done.
1402 * Async buffer_heads are here only as labels for IO, and get
1403 * thrown away once the IO for this page is complete. IO is
1404 * deemed complete once all buffers have been visited
1405 * (b_count==0) and are now unlocked. We must make sure that
1406 * only the _last_ buffer that decrements its count is the one
1407 * that free's the page..
1409 save_flags(flags);
1410 cli();
1411 bh->b_count--;
1412 tmp = bh;
1413 do {
1414 if (tmp->b_count)
1415 goto still_busy;
1416 tmp = tmp->b_this_page;
1417 } while (tmp != bh);
1419 /* OK, the async IO on this page is complete. */
1420 free_async_buffers(bh);
1421 restore_flags(flags);
1422 clear_bit(PG_locked, &page->flags);
1423 wake_up(&page->wait);
1424 after_unlock_page(page);
1425 return;
1427 still_busy:
1428 restore_flags(flags);
1429 return;
1431 not_locked:
1432 printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1433 return;
1435 bad_count:
1436 printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1437 return;
1441 * Start I/O on a page.
1442 * This function expects the page to be locked and may return before I/O is complete.
1443 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1445 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1447 struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
1448 int block, nr;
1450 if (!PageLocked(page))
1451 panic("brw_page: page not locked for I/O");
1452 clear_bit(PG_uptodate, &page->flags);
1453 clear_bit(PG_error, &page->flags);
1455 * Allocate async buffer heads pointing to this page, just for I/O.
1456 * They do _not_ show up in the buffer hash table!
1457 * They are _not_ registered in page->buffers either!
1459 bh = create_buffers(page_address(page), size, 1);
1460 if (!bh) {
1461 /* WSH: exit here leaves page->count incremented */
1462 clear_bit(PG_locked, &page->flags);
1463 wake_up(&page->wait);
1464 return -ENOMEM;
1466 nr = 0;
1467 next = bh;
1468 do {
1469 struct buffer_head * tmp;
1470 block = *(b++);
1472 init_buffer(next, dev, block, end_buffer_io_async, NULL);
1473 set_bit(BH_Uptodate, &next->b_state);
1476 * When we use bmap, we define block zero to represent
1477 * a hole. ll_rw_page, however, may legitimately
1478 * access block zero, and we need to distinguish the
1479 * two cases.
1481 if (bmap && !block) {
1482 memset(next->b_data, 0, size);
1483 next->b_count--;
1484 continue;
1486 tmp = get_hash_table(dev, block, size);
1487 if (tmp) {
1488 if (!buffer_uptodate(tmp)) {
1489 if (rw == READ)
1490 ll_rw_block(READ, 1, &tmp);
1491 wait_on_buffer(tmp);
1493 if (rw == READ)
1494 memcpy(next->b_data, tmp->b_data, size);
1495 else {
1496 memcpy(tmp->b_data, next->b_data, size);
1497 mark_buffer_dirty(tmp, 0);
1499 brelse(tmp);
1500 next->b_count--;
1501 continue;
1503 if (rw == READ)
1504 clear_bit(BH_Uptodate, &next->b_state);
1505 else
1506 set_bit(BH_Dirty, &next->b_state);
1507 arr[nr++] = next;
1508 } while (prev = next, (next = next->b_this_page) != NULL);
1509 prev->b_this_page = bh;
1511 if (nr) {
1512 ll_rw_block(rw, nr, arr);
1513 /* The rest of the work is done in mark_buffer_uptodate()
1514 * and unlock_buffer(). */
1515 } else {
1516 unsigned long flags;
1517 clear_bit(PG_locked, &page->flags);
1518 set_bit(PG_uptodate, &page->flags);
1519 wake_up(&page->wait);
1520 save_flags(flags);
1521 cli();
1522 free_async_buffers(bh);
1523 restore_flags(flags);
1524 after_unlock_page(page);
1526 ++current->maj_flt;
1527 return 0;
1531 * This is called by end_request() when I/O has completed.
1533 void mark_buffer_uptodate(struct buffer_head * bh, int on)
1535 if (on) {
1536 struct buffer_head *tmp = bh;
1537 set_bit(BH_Uptodate, &bh->b_state);
1538 /* If a page has buffers and all these buffers are uptodate,
1539 * then the page is uptodate. */
1540 do {
1541 if (!test_bit(BH_Uptodate, &tmp->b_state))
1542 return;
1543 tmp=tmp->b_this_page;
1544 } while (tmp && tmp != bh);
1545 set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
1546 return;
1548 clear_bit(BH_Uptodate, &bh->b_state);
1552 * Generic "readpage" function for block devices that have the normal
1553 * bmap functionality. This is most of the block device filesystems.
1554 * Reads the page asynchronously --- the unlock_buffer() and
1555 * mark_buffer_uptodate() functions propagate buffer state into the
1556 * page struct once IO has completed.
1558 int generic_readpage(struct file * file, struct page * page)
1560 struct dentry *dentry = file->f_dentry;
1561 struct inode *inode = dentry->d_inode;
1562 unsigned long block;
1563 int *p, nr[PAGE_SIZE/512];
1564 int i;
1566 atomic_inc(&page->count);
1567 set_bit(PG_locked, &page->flags);
1568 set_bit(PG_free_after, &page->flags);
1570 i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1571 block = page->offset >> inode->i_sb->s_blocksize_bits;
1572 p = nr;
1573 do {
1574 *p = inode->i_op->bmap(inode, block);
1575 i--;
1576 block++;
1577 p++;
1578 } while (i > 0);
1580 /* IO start */
1581 brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
1582 return 0;
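
/*
 * Worked example, assuming a 1k-block filesystem and PAGE_SIZE == 4096:
 * s_blocksize_bits == 10, so generic_readpage() bmaps i == 4 logical blocks
 * starting at block == page->offset >> 10 and hands the array to brw_page(),
 * which treats a bmap result of 0 as a hole and zero-fills it rather than
 * reading block 0.
 */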
1586 * Try to increase the number of buffers available: the size argument
1587 * is used to determine what kind of buffers we want.
1589 static int grow_buffers(int pri, int size)
1591 unsigned long page;
1592 struct buffer_head *bh, *tmp;
1593 struct buffer_head * insert_point;
1594 int isize;
1596 if ((size & 511) || (size > PAGE_SIZE)) {
1597 printk("VFS: grow_buffers: size = %d\n",size);
1598 return 0;
1601 if (!(page = __get_free_page(pri)))
1602 return 0;
1603 bh = create_buffers(page, size, 0);
1604 if (!bh) {
1605 free_page(page);
1606 return 0;
1609 isize = BUFSIZE_INDEX(size);
1610 insert_point = free_list[isize];
1612 tmp = bh;
1613 while (1) {
1614 if (insert_point) {
1615 tmp->b_next_free = insert_point->b_next_free;
1616 tmp->b_prev_free = insert_point;
1617 insert_point->b_next_free->b_prev_free = tmp;
1618 insert_point->b_next_free = tmp;
1619 } else {
1620 tmp->b_prev_free = tmp;
1621 tmp->b_next_free = tmp;
1623 insert_point = tmp;
1624 ++nr_buffers;
1625 if (tmp->b_this_page)
1626 tmp = tmp->b_this_page;
1627 else
1628 break;
1630 tmp->b_this_page = bh;
1631 free_list[isize] = bh;
1632 mem_map[MAP_NR(page)].buffers = bh;
1633 buffermem += PAGE_SIZE;
1634 return 1;
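
/*
 * Accounting note: each successful grow_buffers() call consumes exactly one
 * page, carves it into PAGE_SIZE/size buffer heads, links them circularly
 * through b_this_page, puts them on the matching free list, and credits
 * buffermem with PAGE_SIZE; try_to_free_buffer() below undoes this per page.
 */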
1638 /* =========== Reduce the buffer memory ============= */
1640 static inline int buffer_waiting(struct buffer_head * bh)
1642 return waitqueue_active(&bh->b_wait);
/*
 * try_to_free_buffer() checks if all the buffers on this particular page
 * are unused, and frees the page if so.
 */
1649 int try_to_free_buffer(struct buffer_head * bh, struct buffer_head ** bhp,
1650 int priority)
1652 unsigned long page;
1653 struct buffer_head * tmp, * p;
1655 *bhp = bh;
1656 page = (unsigned long) bh->b_data;
1657 page &= PAGE_MASK;
1658 tmp = bh;
1659 do {
1660 if (!tmp)
1661 return 0;
1662 if (tmp->b_count || buffer_protected(tmp) ||
1663 buffer_dirty(tmp) || buffer_locked(tmp) ||
1664 buffer_waiting(tmp))
1665 return 0;
1666 if (priority && buffer_touched(tmp))
1667 return 0;
1668 tmp = tmp->b_this_page;
1669 } while (tmp != bh);
1671 tmp = bh;
1672 do {
1673 p = tmp;
1674 tmp = tmp->b_this_page;
1675 nr_buffers--;
1676 if (p == *bhp) {
1677 *bhp = p->b_prev_free;
1678 if (p == *bhp) /* Was this the last in the list? */
1679 *bhp = NULL;
1681 remove_from_queues(p);
1682 put_unused_buffer_head(p);
1683 } while (tmp != bh);
1684 /* Wake up anyone waiting for buffer heads */
1685 wake_up(&buffer_wait);
1687 buffermem -= PAGE_SIZE;
1688 mem_map[MAP_NR(page)].buffers = NULL;
1689 free_page(page);
1690 return 1;
1693 /* ================== Debugging =================== */
1695 void show_buffers(void)
1697 struct buffer_head * bh;
1698 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
1699 int protected = 0;
1700 int nlist;
1701 static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
1703 printk("Buffer memory: %6dkB\n",buffermem>>10);
1704 printk("Buffer heads: %6d\n",nr_buffer_heads);
1705 printk("Buffer blocks: %6d\n",nr_buffers);
1707 for(nlist = 0; nlist < NR_LIST; nlist++) {
1708 found = locked = dirty = used = lastused = protected = 0;
1709 bh = lru_list[nlist];
1710 if(!bh) continue;
1712 do {
1713 found++;
1714 if (buffer_locked(bh))
1715 locked++;
1716 if (buffer_protected(bh))
1717 protected++;
1718 if (buffer_dirty(bh))
1719 dirty++;
1720 if (bh->b_count)
1721 used++, lastused = found;
1722 bh = bh->b_next_free;
1723 } while (bh != lru_list[nlist]);
1724 printk("%8s: %d buffers, %d used (last=%d), "
1725 "%d locked, %d protected, %d dirty\n",
1726 buf_types[nlist], found, used, lastused,
1727 locked, protected, dirty);
1732 /* ===================== Init ======================= */
/*
 * allocate the hash table and init the free list
 * Use gfp() for the hash table to decrease TLB misses, use
 * SLAB cache for buffer heads.
 */
1739 void __init buffer_init(void)
1741 int order = 5; /* Currently maximum order.. */
1742 unsigned int nr_hash;
1744 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct buffer_head *);
1745 hash_table = (struct buffer_head **) __get_free_pages(GFP_ATOMIC, order);
1747 if (!hash_table)
1748 panic("Failed to allocate buffer hash table\n");
1749 memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
1750 bh_hash_mask = nr_hash-1;
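
	/*
	 * Worked example, assuming PAGE_SIZE == 4096 and 32-bit pointers:
	 * order 5 gives 32 pages == 128kB of hash table, so
	 * nr_hash == 131072 / 4 == 32768 buckets and bh_hash_mask == 0x7fff.
	 * The power-of-two bucket count is what lets _hashfn() use a mask
	 * instead of a modulo.
	 */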
1752 bh_cachep = kmem_cache_create("buffer_head",
1753 sizeof(struct buffer_head),
1755 SLAB_HWCACHE_ALIGN, NULL, NULL);
1756 if(!bh_cachep)
1757 panic("Cannot create buffer head SLAB cache\n");
1759 * Allocate the reserved buffer heads.
1761 while (nr_buffer_heads < NR_RESERVED) {
1762 struct buffer_head * bh;
1764 bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
1765 if (!bh)
1766 break;
1767 put_unused_buffer_head(bh);
1768 nr_buffer_heads++;
1771 lru_list[BUF_CLEAN] = 0;
1772 grow_buffers(GFP_KERNEL, BLOCK_SIZE);
1776 /* ====================== bdflush support =================== */
/* This is a simple kernel daemon, whose job it is to provide a dynamic
 * response to dirty buffers. Once this process is activated, we write back
 * a limited number of buffers to the disks and then go back to sleep again.
 */
1782 static struct wait_queue * bdflush_wait = NULL;
1783 static struct wait_queue * bdflush_done = NULL;
1784 struct task_struct *bdflush_tsk = 0;
1786 void wakeup_bdflush(int wait)
1788 if (current == bdflush_tsk)
1789 return;
1790 wake_up(&bdflush_wait);
1791 if (wait) {
1792 run_task_queue(&tq_disk);
1793 sleep_on(&bdflush_done);
/*
 * Here we attempt to write back old buffers. We also try to flush inodes
 * and supers as well, since this function is essentially "update", and
 * otherwise there would be no way of ensuring that these quantities ever
 * get written back. Ideally, we would have a timestamp on the inodes
 * and superblocks so that we could write back only the old ones as well.
 */
1806 asmlinkage int sync_old_buffers(void)
1808 int i;
1809 int ndirty, nwritten;
1810 int nlist;
1811 int ncount;
1812 struct buffer_head * bh, *next;
1814 sync_supers(0);
1815 sync_inodes(0);
1817 ncount = 0;
1818 #ifdef DEBUG
1819 for(nlist = 0; nlist < NR_LIST; nlist++)
1820 #else
1821 for(nlist = BUF_DIRTY; nlist <= BUF_DIRTY; nlist++)
1822 #endif
1824 ndirty = 0;
1825 nwritten = 0;
1826 repeat:
1828 bh = lru_list[nlist];
1829 if(bh)
1830 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1831 /* We may have stalled while waiting for I/O to complete. */
1832 if(bh->b_list != nlist) goto repeat;
1833 next = bh->b_next_free;
1834 if(!lru_list[nlist]) {
1835 printk("Dirty list empty %d\n", i);
1836 break;
1839 /* Clean buffer on dirty list? Refile it */
1840 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
1842 refile_buffer(bh);
1843 continue;
1846 if (buffer_locked(bh) || !buffer_dirty(bh))
1847 continue;
1848 ndirty++;
1849 if(bh->b_flushtime > jiffies) continue;
1850 nwritten++;
1851 next->b_count++;
1852 bh->b_count++;
1853 bh->b_flushtime = 0;
1854 #ifdef DEBUG
1855 if(nlist != BUF_DIRTY) ncount++;
1856 #endif
1857 ll_rw_block(WRITE, 1, &bh);
1858 bh->b_count--;
1859 next->b_count--;
1862 run_task_queue(&tq_disk);
1863 #ifdef DEBUG
1864 if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
1865 printk("Wrote %d/%d buffers\n", nwritten, ndirty);
1866 #endif
1867 run_task_queue(&tq_disk);
1868 return 0;
1872 /* This is the interface to bdflush. As we get more sophisticated, we can
1873 * pass tuning parameters to this "process", to adjust how it behaves.
1874 * We would want to verify each parameter, however, to make sure that it
1875 * is reasonable. */
1877 asmlinkage int sys_bdflush(int func, long data)
1879 int i, error = -EPERM;
1881 lock_kernel();
1882 if (!capable(CAP_SYS_ADMIN))
1883 goto out;
1885 if (func == 1) {
1886 error = sync_old_buffers();
1887 goto out;
	/* Basically func 2 means read param 1, 3 means write param 1,
	 * 4 means read param 2, etc. */
1891 if (func >= 2) {
1892 i = (func-2) >> 1;
1893 error = -EINVAL;
1894 if (i < 0 || i >= N_PARAM)
1895 goto out;
1896 if((func & 1) == 0) {
1897 error = put_user(bdf_prm.data[i], (int*)data);
1898 goto out;
1900 if (data < bdflush_min[i] || data > bdflush_max[i])
1901 goto out;
1902 bdf_prm.data[i] = data;
1903 error = 0;
1904 goto out;
	/* Func 0 used to launch the actual bdflush and then never
	 * return (unless explicitly killed). We return zero here to
	 * remain semi-compatible with present update(8) programs.
	 */
1911 error = 0;
1912 out:
1913 unlock_kernel();
1914 return error;
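
#if 0
/*
 * Hedged sketch, not part of the original file: the func encoding used by
 * sys_bdflush() above, factored into a hypothetical helper.  For func >= 2,
 * even values read bdf_prm.data[(func-2)>>1] and odd values write it,
 * bounded by bdflush_min[]/bdflush_max[].
 */
static int bdflush_param_index(int func)
{
	return (func - 2) >> 1;	/* func 2,3 -> param 0; 4,5 -> param 1; ... */
}
#endif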
1917 /* This is the actual bdflush daemon itself. It used to be started from
1918 * the syscall above, but now we launch it ourselves internally with
1919 * kernel_thread(...) directly after the first thread in init/main.c */
/* To prevent deadlocks for a loop device:
 * 1) Do non-blocking writes to loop (avoids deadlock with running
 *    out of request blocks).
 * 2) But do a blocking write if the only dirty buffers are loop buffers
 *    (otherwise we go into an infinite busy-loop).
 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
 *    with running out of free buffers for loop's "real" device).
 */
1929 int bdflush(void * unused)
1931 int i;
1932 int ndirty;
1933 int nlist;
1934 int ncount;
1935 struct buffer_head * bh, *next;
1936 int major;
1937 int wrta_cmd = WRITEA; /* non-blocking write for LOOP */
1940 * We have a bare-bones task_struct, and really should fill
1941 * in a few more things so "top" and /proc/2/{exe,root,cwd}
1942 * display semi-sane things. Not real crucial though...
1945 current->session = 1;
1946 current->pgrp = 1;
1947 sprintf(current->comm, "kflushd");
1948 bdflush_tsk = current;
1951 * As a kernel thread we want to tamper with system buffers
1952 * and other internals and thus be subject to the SMP locking
1953 * rules. (On a uniprocessor box this does nothing).
1955 lock_kernel();
1957 for (;;) {
1958 #ifdef DEBUG
1959 printk("bdflush() activated...");
1960 #endif
1962 CHECK_EMERGENCY_SYNC
1964 ncount = 0;
1965 #ifdef DEBUG
1966 for(nlist = 0; nlist < NR_LIST; nlist++)
1967 #else
1968 for(nlist = BUF_DIRTY; nlist <= BUF_DIRTY; nlist++)
1969 #endif
1971 ndirty = 0;
1972 refilled = 0;
1973 repeat:
1975 bh = lru_list[nlist];
1976 if(bh)
1977 for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
1978 bh = next) {
1979 /* We may have stalled while waiting for I/O to complete. */
1980 if(bh->b_list != nlist) goto repeat;
1981 next = bh->b_next_free;
1982 if(!lru_list[nlist]) {
1983 printk("Dirty list empty %d\n", i);
1984 break;
1987 /* Clean buffer on dirty list? Refile it */
1988 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
1990 refile_buffer(bh);
1991 continue;
1994 if (buffer_locked(bh) || !buffer_dirty(bh))
1995 continue;
1996 major = MAJOR(bh->b_dev);
1997 /* Should we write back buffers that are shared or not??
1998 currently dirty buffers are not shared, so it does not matter */
1999 if (refilled && major == LOOP_MAJOR)
2000 continue;
2001 next->b_count++;
2002 bh->b_count++;
2003 ndirty++;
2004 bh->b_flushtime = 0;
2005 if (major == LOOP_MAJOR) {
2006 ll_rw_block(wrta_cmd,1, &bh);
2007 wrta_cmd = WRITEA;
2008 if (buffer_dirty(bh))
2009 --ndirty;
2011 else
2012 ll_rw_block(WRITE, 1, &bh);
2013 #ifdef DEBUG
2014 if(nlist != BUF_DIRTY) ncount++;
2015 #endif
2016 bh->b_count--;
2017 next->b_count--;
2020 #ifdef DEBUG
2021 if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
2022 printk("sleeping again.\n");
2023 #endif
2024 /* If we didn't write anything, but there are still
2025 * dirty buffers, then make the next write to a
2026 * loop device to be a blocking write.
2027 * This lets us block--which we _must_ do! */
2028 if (ndirty == 0 && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
2029 wrta_cmd = WRITE;
2030 continue;
2032 run_task_queue(&tq_disk);
2033 wake_up(&bdflush_done);
2035 /* If there are still a lot of dirty buffers around, skip the sleep
2036 and flush some more */
2037 if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
2038 spin_lock_irq(&current->sigmask_lock);
2039 flush_signals(current);
2040 spin_unlock_irq(&current->sigmask_lock);
2042 interruptible_sleep_on(&bdflush_wait);