1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 #include <linux/malloc.h>
28 #include <linux/locks.h>
29 #include <linux/errno.h>
30 #include <linux/swap.h>
31 #include <linux/swapctl.h>
32 #include <linux/smp_lock.h>
33 #include <linux/vmalloc.h>
34 #include <linux/blkdev.h>
35 #include <linux/sysrq.h>
36 #include <linux/file.h>
37 #include <linux/init.h>
38 #include <linux/quotaops.h>
40 #include <asm/uaccess.h>
41 #include <asm/io.h>
42 #include <asm/bitops.h>
44 #define NR_SIZES 7
45 static char buffersize_index[65] =
46 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
47 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
48 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
49 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
50 6};
52 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
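/* A worked example of the table above (illustrative): BUFSIZE_INDEX(512)
 * is 0, BUFSIZE_INDEX(1024) is 1, BUFSIZE_INDEX(4096) is 3 and
 * BUFSIZE_INDEX(32768) is 6, while an unsupported size such as 1536
 * maps to -1.
 */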
53 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
54 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
55 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
56 number of unused buffer heads */
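/* Illustrative arithmetic, assuming a 4kB PAGE_SIZE: MAX_BUF_PER_PAGE is
 * then 8, so NR_RESERVED is 16 buffer heads and MAX_UNUSED_BUFFERS is 36.
 */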
59 * Hash table mask..
61 static unsigned long bh_hash_mask = 0;
63 static int grow_buffers(int size);
65 static struct buffer_head ** hash_table;
66 static struct buffer_head * lru_list[NR_LIST] = {NULL, };
67 static struct buffer_head * free_list[NR_SIZES] = {NULL, };
69 static kmem_cache_t *bh_cachep;
71 static struct buffer_head * unused_list = NULL;
72 static struct buffer_head * reuse_list = NULL;
73 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
75 static int nr_buffers = 0;
76 static int nr_buffers_type[NR_LIST] = {0,};
77 static int nr_buffer_heads = 0;
78 static int nr_unused_buffer_heads = 0;
79 static int nr_hashed_buffers = 0;
81 /* This is used by some architectures to estimate available memory. */
82 int buffermem = 0;
84 /* Here is the parameter block for the bdflush process. If you add or
85 * remove any of the parameters, make sure to update kernel/sysctl.c.
88 #define N_PARAM 9
90 /* The dummy values in this structure are left in there for compatibility
91 * with old programs that play with the /proc entries.
93 union bdflush_param {
94 struct {
95 int nfract; /* Percentage of buffer cache dirty to
96 activate bdflush */
97 int ndirty; /* Maximum number of dirty blocks to write out per
98 wake-cycle */
99 int nrefill; /* Number of clean buffers to try to obtain
100 each time we call refill */
101 int nref_dirt; /* Dirty buffer threshold for activating bdflush
102 when trying to refill buffers. */
103 int dummy1; /* unused */
104 int age_buffer; /* Time for normal buffer to age before
105 we flush it */
106 int age_super; /* Time for superblock to age before we
107 flush it */
108 int dummy2; /* unused */
109 int dummy3; /* unused */
110 } b_un;
111 unsigned int data[N_PARAM];
112 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
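/* With the defaults above: bdflush is woken once more than 40% of the
 * buffers are dirty (nfract), writes at most 500 buffers per wake-cycle
 * (ndirty), tries to obtain 64 clean buffers per refill (nrefill), and
 * ages ordinary buffers for 30 seconds and superblocks for 5 seconds
 * (age_buffer, age_super) before flushing them.
 */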
114 /* These are the min and max parameter values that we will allow to be assigned */
115 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
116 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
118 void wakeup_bdflush(int);
121 * Rewrote the wait-routines to use the "new" wait-queue functionality,
122 * and got rid of the cli-sti pairs. The wait-queue routines still
123 * need cli-sti, but now it's just a couple of 386 instructions or so.
125 * Note that the real wait_on_buffer() is an inline function that checks
126 * if 'b_wait' is set before calling this, so that the queues aren't set
127 * up unnecessarily.
129 void __wait_on_buffer(struct buffer_head * bh)
131 struct task_struct *tsk = current;
132 DECLARE_WAITQUEUE(wait, tsk);
134 bh->b_count++;
135 add_wait_queue(&bh->b_wait, &wait);
136 repeat:
137 tsk->state = TASK_UNINTERRUPTIBLE;
138 run_task_queue(&tq_disk);
139 if (buffer_locked(bh)) {
140 schedule();
141 goto repeat;
143 tsk->state = TASK_RUNNING;
144 remove_wait_queue(&bh->b_wait, &wait);
145 bh->b_count--;
148 /* Call sync_buffers with wait!=0 to ensure that the call does not
149 * return until all buffer writes have completed. Sync() may return
150 * before the writes have finished; fsync() may not.
153 /* Godamity-damn. Some buffers (bitmaps for filesystems)
154 * spontaneously dirty themselves without ever brelse being called.
155 * We will ultimately want to put these in a separate list, but for
156 * now we search all of the lists for dirty buffers.
158 static int sync_buffers(kdev_t dev, int wait)
160 int i, retry, pass = 0, err = 0;
161 struct buffer_head * bh, *next;
163 /* One pass for no-wait, three for wait:
164 * 0) write out all dirty, unlocked buffers;
165 * 1) write out all dirty buffers, waiting if locked;
166 * 2) wait for completion by waiting for all buffers to unlock.
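*
* As used below, sync_dev() only makes wait == 0 (single-pass) calls,
* while fsync_dev() finishes with a wait != 0 call and so gets the
* full three passes described above.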
168 do {
169 retry = 0;
170 repeat:
171 /* We search all lists as a failsafe mechanism, not because we expect
172 * there to be dirty buffers on any of the other lists.
174 bh = lru_list[BUF_DIRTY];
175 if (!bh)
176 goto repeat2;
177 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
178 if (bh->b_list != BUF_DIRTY)
179 goto repeat;
180 next = bh->b_next_free;
181 if (!lru_list[BUF_DIRTY])
182 break;
183 if (dev && bh->b_dev != dev)
184 continue;
185 if (buffer_locked(bh)) {
186 /* Buffer is locked; skip it unless wait is
187 * requested AND pass > 0.
189 if (!wait || !pass) {
190 retry = 1;
191 continue;
193 wait_on_buffer (bh);
194 goto repeat;
197 /* If an unlocked buffer is not uptodate, there has
198 * been an IO error. Skip it.
200 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
201 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
202 err = -EIO;
203 continue;
206 /* Don't write clean buffers. Don't write ANY buffers
207 * on the third pass.
209 if (!buffer_dirty(bh) || pass >= 2)
210 continue;
212 /* Don't bother about locked buffers.
214 * XXX We checked if it was locked above and there is no
215 * XXX way we could have slept in between. -DaveM
217 if (buffer_locked(bh))
218 continue;
219 bh->b_count++;
220 next->b_count++;
221 bh->b_flushtime = 0;
222 ll_rw_block(WRITE, 1, &bh);
223 bh->b_count--;
224 next->b_count--;
225 retry = 1;
228 repeat2:
229 bh = lru_list[BUF_LOCKED];
230 if (!bh)
231 break;
232 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
233 if (bh->b_list != BUF_LOCKED)
234 goto repeat2;
235 next = bh->b_next_free;
236 if (!lru_list[BUF_LOCKED])
237 break;
238 if (dev && bh->b_dev != dev)
239 continue;
240 if (buffer_locked(bh)) {
241 /* Buffer is locked; skip it unless wait is
242 * requested AND pass > 0.
244 if (!wait || !pass) {
245 retry = 1;
246 continue;
248 wait_on_buffer (bh);
249 goto repeat2;
253 /* If we are waiting for the sync to succeed, and if any dirty
254 * blocks were written, then repeat; on the second pass, only
255 * wait for buffers being written (do not pass to write any
256 * more buffers on the second pass).
258 } while (wait && retry && ++pass<=2);
259 return err;
262 void sync_dev(kdev_t dev)
264 sync_buffers(dev, 0);
265 sync_supers(dev);
266 sync_inodes(dev);
267 sync_buffers(dev, 0);
268 DQUOT_SYNC(dev);
270 * FIXME(eric) we need to sync the physical devices here.
271 * This is because some (scsi) controllers have huge amounts of
272 * cache onboard (hundreds of Mb), and we need to instruct
273 * them to commit all of the dirty memory to disk, and we should
274 * not return until this has happened.
276 * This would need to get implemented by going through the assorted
277 * layers so that each block major number can be synced, and this
278 * would call down into the upper and mid-layer scsi.
282 int fsync_dev(kdev_t dev)
284 sync_buffers(dev, 0);
285 sync_supers(dev);
286 sync_inodes(dev);
287 DQUOT_SYNC(dev);
288 return sync_buffers(dev, 1);
291 asmlinkage int sys_sync(void)
293 lock_kernel();
294 fsync_dev(0);
295 unlock_kernel();
296 return 0;
300 * filp may be NULL if called via the msync of a vma.
303 int file_fsync(struct file *filp, struct dentry *dentry)
305 struct inode * inode = dentry->d_inode;
306 struct super_block * sb;
307 kdev_t dev;
309 /* sync the inode to buffers */
310 write_inode_now(inode);
312 /* sync the superblock to buffers */
313 sb = inode->i_sb;
314 wait_on_super(sb);
315 if (sb->s_op && sb->s_op->write_super)
316 sb->s_op->write_super(sb);
318 /* .. finally sync the buffers to disk */
319 dev = inode->i_dev;
320 return sync_buffers(dev, 1);
323 asmlinkage int sys_fsync(unsigned int fd)
325 struct file * file;
326 struct dentry * dentry;
327 struct inode * inode;
328 int err;
330 lock_kernel();
331 err = -EBADF;
332 file = fget(fd);
333 if (!file)
334 goto out;
336 dentry = file->f_dentry;
337 if (!dentry)
338 goto out_putf;
340 inode = dentry->d_inode;
341 if (!inode)
342 goto out_putf;
344 err = -EINVAL;
345 if (!file->f_op || !file->f_op->fsync)
346 goto out_putf;
348 /* We need to protect against concurrent writers.. */
349 down(&inode->i_sem);
350 err = file->f_op->fsync(file, dentry);
351 up(&inode->i_sem);
353 out_putf:
354 fput(file);
355 out:
356 unlock_kernel();
357 return err;
360 asmlinkage int sys_fdatasync(unsigned int fd)
362 struct file * file;
363 struct dentry * dentry;
364 struct inode * inode;
365 int err;
367 lock_kernel();
368 err = -EBADF;
369 file = fget(fd);
370 if (!file)
371 goto out;
373 dentry = file->f_dentry;
374 if (!dentry)
375 goto out_putf;
377 inode = dentry->d_inode;
378 if (!inode)
379 goto out_putf;
381 err = -EINVAL;
382 if (!file->f_op || !file->f_op->fsync)
383 goto out_putf;
385 /* this needs further work, at the moment it is identical to fsync() */
386 down(&inode->i_sem);
387 err = file->f_op->fsync(file, dentry);
388 up(&inode->i_sem);
390 out_putf:
391 fput(file);
392 out:
393 unlock_kernel();
394 return err;
397 void invalidate_buffers(kdev_t dev)
399 int i;
400 int nlist;
401 struct buffer_head * bh;
403 for(nlist = 0; nlist < NR_LIST; nlist++) {
404 bh = lru_list[nlist];
405 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
406 if (bh->b_dev != dev)
407 continue;
408 wait_on_buffer(bh);
409 if (bh->b_dev != dev)
410 continue;
411 if (bh->b_count)
412 continue;
413 bh->b_flushtime = 0;
414 clear_bit(BH_Protected, &bh->b_state);
415 clear_bit(BH_Uptodate, &bh->b_state);
416 clear_bit(BH_Dirty, &bh->b_state);
417 clear_bit(BH_Req, &bh->b_state);
422 #define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
423 #define hash(dev,block) hash_table[_hashfn(dev,block)]
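/* Example (illustrative): the bucket for (dev, block) is
 * hash_table[(HASHDEV(dev) ^ block) & bh_hash_mask]. The table size
 * chosen in buffer_init() is a power of two, so masking with
 * bh_hash_mask is a cheap modulo by the number of hash buckets.
 */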
425 static void insert_into_hash_list(struct buffer_head * bh)
427 bh->b_next = NULL;
428 bh->b_pprev = NULL;
429 if (bh->b_dev) {
430 struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
431 struct buffer_head *next = *bhp;
433 if (next) {
434 bh->b_next = next;
435 next->b_pprev = &bh->b_next;
437 *bhp = bh;
438 bh->b_pprev = bhp;
439 nr_hashed_buffers++;
443 static void remove_from_hash_queue(struct buffer_head * bh)
445 struct buffer_head **pprev = bh->b_pprev;
446 if (pprev) {
447 struct buffer_head * next = bh->b_next;
448 if (next) {
449 next->b_pprev = pprev;
450 bh->b_next = NULL;
452 *pprev = next;
453 bh->b_pprev = NULL;
454 nr_hashed_buffers--;
458 static void insert_into_lru_list(struct buffer_head * bh)
460 struct buffer_head **bhp = &lru_list[bh->b_list];
462 if (bh->b_dev == B_FREE)
463 BUG();
465 if(!*bhp) {
466 *bhp = bh;
467 bh->b_prev_free = bh;
470 if (bh->b_next_free)
471 panic("VFS: buffer LRU pointers corrupted");
473 bh->b_next_free = *bhp;
474 bh->b_prev_free = (*bhp)->b_prev_free;
475 (*bhp)->b_prev_free->b_next_free = bh;
476 (*bhp)->b_prev_free = bh;
478 nr_buffers++;
479 nr_buffers_type[bh->b_list]++;
482 static void remove_from_lru_list(struct buffer_head * bh)
484 if (!(bh->b_prev_free) || !(bh->b_next_free))
485 return;
487 if (bh->b_dev == B_FREE) {
488 printk("LRU list corrupted");
489 *(int*)0 = 0;
491 bh->b_prev_free->b_next_free = bh->b_next_free;
492 bh->b_next_free->b_prev_free = bh->b_prev_free;
494 if (lru_list[bh->b_list] == bh)
495 lru_list[bh->b_list] = bh->b_next_free;
496 if (lru_list[bh->b_list] == bh)
497 lru_list[bh->b_list] = NULL;
498 bh->b_next_free = bh->b_prev_free = NULL;
500 nr_buffers--;
501 nr_buffers_type[bh->b_list]--;
504 static void remove_from_free_list(struct buffer_head * bh)
506 int isize = BUFSIZE_INDEX(bh->b_size);
507 if (!(bh->b_prev_free) || !(bh->b_next_free))
508 panic("VFS: Free block list corrupted");
509 if(bh->b_dev != B_FREE)
510 panic("Free list corrupted");
511 if(!free_list[isize])
512 panic("Free list empty");
513 if(bh->b_next_free == bh)
514 free_list[isize] = NULL;
515 else {
516 bh->b_prev_free->b_next_free = bh->b_next_free;
517 bh->b_next_free->b_prev_free = bh->b_prev_free;
518 if (free_list[isize] == bh)
519 free_list[isize] = bh->b_next_free;
521 bh->b_next_free = bh->b_prev_free = NULL;
524 static void remove_from_queues(struct buffer_head * bh)
526 if (bh->b_dev == B_FREE)
527 BUG();
528 remove_from_hash_queue(bh);
529 remove_from_lru_list(bh);
532 static void put_last_free(struct buffer_head * bh)
534 if (bh) {
535 struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
537 if (bh->b_count)
538 BUG();
540 bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */
542 /* Add to back of free list. */
543 if(!*bhp) {
544 *bhp = bh;
545 bh->b_prev_free = bh;
548 bh->b_next_free = *bhp;
549 bh->b_prev_free = (*bhp)->b_prev_free;
550 (*bhp)->b_prev_free->b_next_free = bh;
551 (*bhp)->b_prev_free = bh;
555 struct buffer_head * find_buffer(kdev_t dev, int block, int size)
557 struct buffer_head * next;
559 next = hash(dev,block);
560 for (;;) {
561 struct buffer_head *tmp = next;
562 if (!next)
563 break;
564 next = tmp->b_next;
565 if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
566 continue;
567 next = tmp;
568 break;
570 return next;
574 * Why like this, I hear you say... The reason is race-conditions.
575 * As we don't lock buffers (unless we are reading them, that is),
576 * something might happen to it while we sleep (ie a read-error
577 * will force it bad). This shouldn't really happen currently, but
578 * the code is ready.
580 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
582 struct buffer_head * bh;
583 bh = find_buffer(dev,block,size);
584 if (bh)
585 bh->b_count++;
586 return bh;
589 unsigned int get_hardblocksize(kdev_t dev)
592 * Get the hard sector size for the given device. If we don't know
593 * what it is, return 0.
595 if (hardsect_size[MAJOR(dev)] != NULL) {
596 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
597 if (blksize != 0)
598 return blksize;
602 * We don't know what the hardware sector size for this device is.
603 * Return 0 indicating that we don't know.
605 return 0;
608 void set_blocksize(kdev_t dev, int size)
610 extern int *blksize_size[];
611 int i, nlist;
612 struct buffer_head * bh, *bhnext;
614 if (!blksize_size[MAJOR(dev)])
615 return;
617 /* Size must be a power of two, and between 512 and PAGE_SIZE */
618 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
619 panic("Invalid blocksize passed to set_blocksize");
621 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
622 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
623 return;
625 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
626 return;
627 sync_buffers(dev, 2);
628 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
630 /* We need to be quite careful how we do this - we are moving entries
631 * around on the free list, and we can get in a loop if we are not careful.
633 for(nlist = 0; nlist < NR_LIST; nlist++) {
634 bh = lru_list[nlist];
635 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
636 if(!bh)
637 break;
639 bhnext = bh->b_next_free;
640 if (bh->b_dev != dev)
641 continue;
642 if (bh->b_size == size)
643 continue;
644 bhnext->b_count++;
645 bh->b_count++;
646 wait_on_buffer(bh);
647 bhnext->b_count--;
648 if (bh->b_dev == dev && bh->b_size != size) {
649 clear_bit(BH_Dirty, &bh->b_state);
650 clear_bit(BH_Uptodate, &bh->b_state);
651 clear_bit(BH_Req, &bh->b_state);
652 bh->b_flushtime = 0;
654 if (--bh->b_count)
655 continue;
656 remove_from_queues(bh);
657 put_last_free(bh);
663 * We used to try various strange things. Let's not.
665 static void refill_freelist(int size)
667 if (!grow_buffers(size)) {
668 wakeup_bdflush(1);
669 current->policy |= SCHED_YIELD;
670 schedule();
674 void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
675 bh_end_io_t *handler, void *dev_id)
677 bh->b_list = BUF_CLEAN;
678 bh->b_flushtime = 0;
679 bh->b_dev = dev;
680 bh->b_blocknr = block;
681 bh->b_end_io = handler;
682 bh->b_dev_id = dev_id;
685 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
687 mark_buffer_uptodate(bh, uptodate);
688 unlock_buffer(bh);
692 * Ok, this is getblk, and it isn't very clear, again to hinder
693 * race-conditions. Most of the code is seldom used, (ie repeating),
694 * so it should be much more efficient than it looks.
696 * The algorithm is changed: hopefully better, and an elusive bug removed.
698 * 14.02.92: changed it to sync dirty buffers a bit: better performance
699 * when the filesystem starts to get full of dirty blocks (I hope).
701 struct buffer_head * getblk(kdev_t dev, int block, int size)
703 struct buffer_head * bh;
704 int isize;
706 repeat:
707 bh = get_hash_table(dev, block, size);
708 if (bh) {
709 if (!buffer_dirty(bh)) {
710 bh->b_flushtime = 0;
712 goto out;
715 isize = BUFSIZE_INDEX(size);
716 get_free:
717 bh = free_list[isize];
718 if (!bh)
719 goto refill;
720 remove_from_free_list(bh);
722 /* OK, FINALLY we know that this buffer is the only one of its kind,
723 * and that it's unused (b_count=0), unlocked, and clean.
725 init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
726 bh->b_count = 1;
727 bh->b_state = 0;
729 /* Insert the buffer into the regular lists */
730 insert_into_lru_list(bh);
731 insert_into_hash_list(bh);
732 goto out;
735 * If we block while refilling the free list, somebody may
736 * create the buffer first ... search the hashes again.
738 refill:
739 refill_freelist(size);
740 if (!find_buffer(dev,block,size))
741 goto get_free;
742 goto repeat;
743 out:
744 return bh;
747 void set_writetime(struct buffer_head * buf, int flag)
749 int newtime;
751 if (buffer_dirty(buf)) {
752 /* Set (or advance) the time at which this dirty buffer should be flushed. */
753 newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
754 bdf_prm.b_un.age_buffer);
755 if(!buf->b_flushtime || buf->b_flushtime > newtime)
756 buf->b_flushtime = newtime;
757 } else {
758 buf->b_flushtime = 0;
763 * Put a buffer into the appropriate list, without side-effects.
765 static void file_buffer(struct buffer_head *bh, int list)
767 remove_from_lru_list(bh);
768 bh->b_list = list;
769 insert_into_lru_list(bh);
773 * if a new dirty buffer is created we need to balance bdflush.
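* With the default nfract of 40 this means, for example, that a system
* with 10000 buffers starts waking bdflush once more than 4000 of them
* are dirty (illustrative figures).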
775 static inline void balance_dirty (kdev_t dev)
777 int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
779 /* This buffer is dirty, maybe we need to start flushing.
780 * If too high a percentage of the buffers are dirty...
782 if (nr_buffers_type[BUF_DIRTY] > too_many) {
783 wakeup_bdflush(1);
786 /* If this is a loop device, and
787 * more than half of the buffers are dirty...
788 * (Prevents no-free-buffers deadlock with loop device.)
790 if (MAJOR(dev) == LOOP_MAJOR &&
791 nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
792 wakeup_bdflush(1);
796 * A buffer may need to be moved from one buffer list to another
797 * (e.g. in case it is not shared any more). Handle this.
799 void refile_buffer(struct buffer_head * buf)
801 int dispose;
803 if(buf->b_dev == B_FREE) {
804 printk("Attempt to refile free buffer\n");
805 return;
807 if (buffer_dirty(buf))
808 dispose = BUF_DIRTY;
809 else if (buffer_locked(buf))
810 dispose = BUF_LOCKED;
811 else
812 dispose = BUF_CLEAN;
813 if(dispose != buf->b_list) {
814 file_buffer(buf, dispose);
815 if (dispose == BUF_DIRTY)
816 balance_dirty(buf->b_dev);
821 * Release a buffer head
823 void __brelse(struct buffer_head * buf)
825 /* If dirty, mark the time this buffer should be written back. */
826 set_writetime(buf, 0);
827 refile_buffer(buf);
828 touch_buffer(buf);
830 if (buf->b_count) {
831 buf->b_count--;
832 wake_up(&buffer_wait);
833 return;
835 printk("VFS: brelse: Trying to free free buffer\n");
839 * bforget() is like brelse(), except it puts the buffer on the
840 * free list if it can.. We can NOT free the buffer if:
841 * - there are other users of it
842 * - it is locked and thus can have active IO
844 void __bforget(struct buffer_head * buf)
846 if (buf->b_count != 1 || buffer_locked(buf)) {
847 __brelse(buf);
848 return;
850 buf->b_count = 0;
851 buf->b_state = 0;
852 remove_from_queues(buf);
853 put_last_free(buf);
857 * bread() reads a specified block and returns the buffer that contains
858 * it. It returns NULL if the block was unreadable.
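*
* A typical calling pattern (sketch): bh = bread(dev, nr, BLOCK_SIZE);
* if bh is non-NULL the block's contents are at bh->b_data, and the
* caller must brelse(bh) when finished with it.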
860 struct buffer_head * bread(kdev_t dev, int block, int size)
862 struct buffer_head * bh;
864 bh = getblk(dev, block, size);
865 if (buffer_uptodate(bh))
866 return bh;
867 ll_rw_block(READ, 1, &bh);
868 wait_on_buffer(bh);
869 if (buffer_uptodate(bh))
870 return bh;
871 brelse(bh);
872 return NULL;
876 * Ok, breada can be used as bread, but additionally starts read-ahead
877 * on up to NBUF following blocks; the pos/filesize arguments are used
878 * to size the read-ahead window.
881 #define NBUF 16
883 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
884 unsigned int pos, unsigned int filesize)
886 struct buffer_head * bhlist[NBUF];
887 unsigned int blocks;
888 struct buffer_head * bh;
889 int index;
890 int i, j;
892 if (pos >= filesize)
893 return NULL;
895 if (block < 0)
896 return NULL;
898 bh = getblk(dev, block, bufsize);
899 index = BUFSIZE_INDEX(bh->b_size);
901 if (buffer_uptodate(bh))
902 return(bh);
903 else ll_rw_block(READ, 1, &bh);
905 blocks = (filesize - pos) >> (9+index);
907 if (blocks < (read_ahead[MAJOR(dev)] >> index))
908 blocks = read_ahead[MAJOR(dev)] >> index;
909 if (blocks > NBUF)
910 blocks = NBUF;
912 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
914 bhlist[0] = bh;
915 j = 1;
916 for(i=1; i<blocks; i++) {
917 bh = getblk(dev,block+i,bufsize);
918 if (buffer_uptodate(bh)) {
919 brelse(bh);
920 break;
922 else bhlist[j++] = bh;
925 /* Request the read for these buffers, and then release them. */
926 if (j>1)
927 ll_rw_block(READA, (j-1), bhlist+1);
928 for(i=1; i<j; i++)
929 brelse(bhlist[i]);
931 /* Wait for this buffer, and then continue on. */
932 bh = bhlist[0];
933 wait_on_buffer(bh);
934 if (buffer_uptodate(bh))
935 return bh;
936 brelse(bh);
937 return NULL;
941 * Note: the caller should wake up the buffer_wait list if needed.
943 static void put_unused_buffer_head(struct buffer_head * bh)
945 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
946 nr_buffer_heads--;
947 kmem_cache_free(bh_cachep, bh);
948 return;
951 // memset(bh, 0, sizeof(*bh));
952 bh->b_blocknr = -1;
953 init_waitqueue_head(&bh->b_wait);
954 nr_unused_buffer_heads++;
955 bh->b_next_free = unused_list;
956 unused_list = bh;
960 * We can't put completed temporary IO buffer_heads directly onto the
961 * unused_list when they become unlocked, since the device driver
962 * end_request routines still expect access to the buffer_head's
963 * fields after the final unlock. So, the device driver puts them on
964 * the reuse_list instead once IO completes, and we recover these to
965 * the unused_list here.
967 * Note that we don't do a wakeup here, but return a flag indicating
968 * whether we got any buffer heads. A task ready to sleep can check
969 * the returned value, and any tasks already sleeping will have been
970 * awakened when the buffer heads were added to the reuse list.
972 static inline int recover_reusable_buffer_heads(void)
974 struct buffer_head *head = xchg(&reuse_list, NULL);
975 int found = 0;
977 if (head) {
978 do {
979 struct buffer_head *bh = head;
980 head = head->b_next_free;
981 put_unused_buffer_head(bh);
982 } while (head);
983 found = 1;
985 return found;
989 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
990 * no-buffer-head deadlock. Return NULL on failure; waiting for
991 * buffer heads is now handled in create_buffers().
993 static struct buffer_head * get_unused_buffer_head(int async)
995 struct buffer_head * bh;
997 recover_reusable_buffer_heads();
998 if (nr_unused_buffer_heads > NR_RESERVED) {
999 bh = unused_list;
1000 unused_list = bh->b_next_free;
1001 nr_unused_buffer_heads--;
1002 return bh;
1005 /* This is critical. We can't swap out pages to get
1006 * more buffer heads, because the swap-out may need
1007 * more buffer-heads itself. Thus SLAB_BUFFER.
1009 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1010 memset(bh, 0, sizeof(*bh));
1011 init_waitqueue_head(&bh->b_wait);
1012 nr_buffer_heads++;
1013 return bh;
1017 * If we need an async buffer, use the reserved buffer heads.
1019 if (async && unused_list) {
1020 bh = unused_list;
1021 unused_list = bh->b_next_free;
1022 nr_unused_buffer_heads--;
1023 return bh;
1026 #if 0
1028 * (Pending further analysis ...)
1029 * Ordinary (non-async) requests can use a different memory priority
1030 * to free up pages. Any swapping thus generated will use async
1031 * buffer heads.
1033 if(!async &&
1034 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1035 memset(bh, 0, sizeof(*bh));
1036 init_waitqueue_head(&bh->b_wait);
1037 nr_buffer_heads++;
1038 return bh;
1040 #endif
1042 return NULL;
1046 * Create the appropriate buffers when given a page for data area and
1047 * the size of each buffer.. Use the bh->b_this_page linked list to
1048 * follow the buffers created. Return NULL if unable to create more
1049 * buffers.
1050 * The async flag is used to differentiate async IO (paging, swapping)
1051 * from ordinary buffer allocations, and only async requests are allowed
1052 * to sleep waiting for buffer heads.
1054 static struct buffer_head * create_buffers(unsigned long page,
1055 unsigned long size, int async)
1057 DECLARE_WAITQUEUE(wait, current);
1058 struct buffer_head *bh, *head;
1059 long offset;
1061 try_again:
1062 head = NULL;
1063 offset = PAGE_SIZE;
1064 while ((offset -= size) >= 0) {
1065 bh = get_unused_buffer_head(async);
1066 if (!bh)
1067 goto no_grow;
1069 bh->b_dev = B_FREE; /* Flag as unused */
1070 bh->b_this_page = head;
1071 head = bh;
1073 bh->b_state = 0;
1074 bh->b_next_free = NULL;
1075 bh->b_count = 0;
1076 bh->b_size = size;
1078 bh->b_data = (char *) (page+offset);
1079 bh->b_list = 0;
1081 return head;
1083 * In case anything failed, we just free everything we got.
1085 no_grow:
1086 if (head) {
1087 do {
1088 bh = head;
1089 head = head->b_this_page;
1090 put_unused_buffer_head(bh);
1091 } while (head);
1093 /* Wake up any waiters ... */
1094 wake_up(&buffer_wait);
1098 * Return failure for non-async IO requests. Async IO requests
1099 * are not allowed to fail, so we have to wait until buffer heads
1100 * become available. But we don't want tasks sleeping with
1101 * partially complete buffers, so all were released above.
1103 if (!async)
1104 return NULL;
1106 /* We're _really_ low on memory. Now we just
1107 * wait for old buffer heads to become free due to
1108 * finishing IO. Since this is an async request and
1109 * the reserve list is empty, we're sure there are
1110 * async buffer heads in use.
1112 run_task_queue(&tq_disk);
1115 * Set our state for sleeping, then check again for buffer heads.
1116 * This ensures we won't miss a wake_up from an interrupt.
1118 add_wait_queue(&buffer_wait, &wait);
1119 current->state = TASK_UNINTERRUPTIBLE;
1120 if (!recover_reusable_buffer_heads())
1121 schedule();
1122 remove_wait_queue(&buffer_wait, &wait);
1123 current->state = TASK_RUNNING;
1124 goto try_again;
1127 /* Run the hooks that have to be done when a page I/O has completed. */
1128 static inline void after_unlock_page (struct page * page)
1130 if (test_and_clear_bit(PG_decr_after, &page->flags)) {
1131 atomic_dec(&nr_async_pages);
1132 #ifdef DEBUG_SWAP
1133 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1134 (char *) page_address(page),
1135 atomic_read(&nr_async_pages));
1136 #endif
1138 if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
1139 swap_after_unlock_page(page->offset);
1140 if (test_and_clear_bit(PG_free_after, &page->flags))
1141 __free_page(page);
1145 * Free all temporary buffers belonging to a page.
1146 * This needs to be called with interrupts disabled.
1148 static inline void free_async_buffers (struct buffer_head * bh)
1150 struct buffer_head *tmp, *tail;
1153 * Link all the buffers into the b_next_free list,
1154 * so we only have to do one xchg() operation ...
1156 tail = bh;
1157 while ((tmp = tail->b_this_page) != bh) {
1158 tail->b_next_free = tmp;
1159 tail = tmp;
1162 /* Update the reuse list */
1163 tail->b_next_free = xchg(&reuse_list, NULL);
1164 reuse_list = bh;
1166 /* Wake up any waiters ... */
1167 wake_up(&buffer_wait);
1170 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
1172 unsigned long flags;
1173 struct buffer_head *tmp;
1174 struct page *page;
1176 mark_buffer_uptodate(bh, uptodate);
1178 /* This is a temporary buffer used for page I/O. */
1179 page = mem_map + MAP_NR(bh->b_data);
1181 if (!uptodate)
1182 SetPageError(page);
1185 * Be _very_ careful from here on. Bad things can happen if
1186 * two buffer heads end IO at almost the same time and both
1187 * decide that the page is now completely done.
1189 * Async buffer_heads are here only as labels for IO, and get
1190 * thrown away once the IO for this page is complete. IO is
1191 * deemed complete once all buffers have been visited
1192 * (b_count==0) and are now unlocked. We must make sure that
1193 * only the _last_ buffer that decrements its count is the one
1194 * that frees the page..
1196 save_flags(flags);
1197 cli();
1198 unlock_buffer(bh);
1199 tmp = bh->b_this_page;
1200 while (tmp != bh) {
1201 if (buffer_locked(tmp))
1202 goto still_busy;
1203 tmp = tmp->b_this_page;
1206 /* OK, the async IO on this page is complete. */
1207 restore_flags(flags);
1209 after_unlock_page(page);
1211 * if none of the buffers had errors then we can set the
1212 * page uptodate:
1214 if (!PageError(page))
1215 SetPageUptodate(page);
1216 if (page->owner != -1)
1217 PAGE_BUG(page);
1218 page->owner = (int)current;
1219 UnlockPage(page);
1221 return;
1223 still_busy:
1224 restore_flags(flags);
1225 return;
1228 static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1230 struct buffer_head *head, *bh, *tail;
1231 int block;
1233 if (!PageLocked(page))
1234 BUG();
1235 if (page->owner != (int)current)
1236 PAGE_BUG(page);
1238 * Allocate async buffer heads pointing to this page, just for I/O.
1239 * They show up in the buffer hash table and are registered in
1240 * page->buffers.
1242 head = create_buffers(page_address(page), size, 1);
1243 if (page->buffers)
1244 BUG();
1245 if (!head)
1246 BUG();
1247 tail = head;
1248 for (bh = head; bh; bh = bh->b_this_page) {
1249 block = *(b++);
1251 tail = bh;
1252 init_buffer(bh, dev, block, end_buffer_io_async, NULL);
1255 * When we use bmap, we define block zero to represent
1256 * a hole. ll_rw_page, however, may legitimately
1257 * access block zero, and we need to distinguish the
1258 * two cases.
1260 if (bmap && !block) {
1261 set_bit(BH_Uptodate, &bh->b_state);
1262 memset(bh->b_data, 0, size);
1265 tail->b_this_page = head;
1266 get_page(page);
1267 page->buffers = head;
1268 return 0;
1272 * We don't have to release all buffers here, but
1273 * we have to be sure that no dirty buffer is left
1274 * and no IO is going on (no buffer is locked), because
1275 * we have truncated the file and are going to free the
1276 * blocks on-disk..
1278 int generic_block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
1280 struct buffer_head *head, *bh, *next;
1281 unsigned int curr_off = 0;
1283 if (!PageLocked(page))
1284 BUG();
1285 if (!page->buffers)
1286 return 0;
1288 head = page->buffers;
1289 bh = head;
1290 do {
1291 unsigned int next_off = curr_off + bh->b_size;
1292 next = bh->b_this_page;
1295 * is this block fully flushed?
1297 if (offset <= curr_off) {
1298 if (bh->b_blocknr) {
1299 bh->b_count++;
1300 wait_on_buffer(bh);
1301 if (bh->b_dev == B_FREE)
1302 BUG();
1303 mark_buffer_clean(bh);
1304 bh->b_blocknr = 0;
1305 bh->b_count--;
1308 curr_off = next_off;
1309 bh = next;
1310 } while (bh != head);
1313 * subtle. We release buffer-heads only if this is
1314 * the 'final' flushpage. We invalidate the bmap
1315 * cached value in all cases.
1317 if (!offset)
1318 try_to_free_buffers(page);
1320 return 0;
1323 static inline void create_empty_buffers (struct page *page,
1324 struct inode *inode, unsigned long blocksize)
1326 struct buffer_head *bh, *head, *tail;
1328 head = create_buffers(page_address(page), blocksize, 1);
1329 if (page->buffers)
1330 BUG();
1332 bh = head;
1333 do {
1334 bh->b_dev = inode->i_dev;
1335 bh->b_blocknr = 0;
1336 tail = bh;
1337 bh = bh->b_this_page;
1338 } while (bh);
1339 tail->b_this_page = head;
1340 page->buffers = head;
1341 get_page(page);
1344 int block_write_full_page (struct file *file, struct page *page, fs_getblock_t fs_get_block)
1346 struct dentry *dentry = file->f_dentry;
1347 struct inode *inode = dentry->d_inode;
1348 int err, created, i;
1349 unsigned long block, phys, offset;
1350 struct buffer_head *bh, *head;
1352 if (!PageLocked(page))
1353 BUG();
1355 if (!page->buffers)
1356 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1357 head = page->buffers;
1359 offset = page->offset;
1360 block = offset >> inode->i_sb->s_blocksize_bits;
1362 // FIXME: currently we assume page alignment.
1363 if (offset & (PAGE_SIZE-1))
1364 BUG();
1366 bh = head;
1367 i = 0;
1368 do {
1369 if (!bh)
1370 BUG();
1372 if (!bh->b_blocknr) {
1373 err = -EIO;
1374 down(&inode->i_sem);
1375 phys = fs_get_block (inode, block, 1, &err, &created);
1376 up(&inode->i_sem);
1377 if (!phys)
1378 goto out;
1380 init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
1381 bh->b_state = (1<<BH_Uptodate);
1382 } else {
1384 * block already exists, just mark it dirty:
1386 bh->b_end_io = end_buffer_io_sync;
1387 set_bit(BH_Uptodate, &bh->b_state);
1389 mark_buffer_dirty(bh, 0);
1391 bh = bh->b_this_page;
1392 block++;
1393 } while (bh != head);
1395 SetPageUptodate(page);
1396 return 0;
1397 out:
1398 ClearPageUptodate(page);
1399 return err;
1402 int block_write_one_page (struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf, fs_getblock_t fs_get_block)
1404 struct dentry *dentry = file->f_dentry;
1405 struct inode *inode = dentry->d_inode;
1406 unsigned long block;
1407 int err, created;
1408 unsigned long blocksize, start_block, end_block;
1409 unsigned long start_offset, start_bytes, end_bytes;
1410 unsigned long bbits, phys, blocks, i, len;
1411 struct buffer_head *bh, *head;
1412 char * target_buf;
1414 target_buf = (char *)page_address(page) + offset;
1415 lock_kernel();
1417 if (!PageLocked(page))
1418 BUG();
1420 blocksize = inode->i_sb->s_blocksize;
1421 if (!page->buffers)
1422 create_empty_buffers(page, inode, blocksize);
1423 head = page->buffers;
1425 bbits = inode->i_sb->s_blocksize_bits;
1426 block = page->offset >> bbits;
1427 blocks = PAGE_SIZE >> bbits;
1428 start_block = offset >> bbits;
1429 end_block = (offset + bytes - 1) >> bbits;
1430 start_offset = offset & (blocksize - 1);
1431 start_bytes = blocksize - start_offset;
1432 if (start_bytes > bytes)
1433 start_bytes = bytes;
1434 end_bytes = (offset+bytes) & (blocksize - 1);
1435 if (end_bytes > bytes)
1436 end_bytes = bytes;
1438 if (offset < 0 || offset >= PAGE_SIZE)
1439 BUG();
1440 if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
1441 BUG();
1442 if (start_block < 0 || start_block >= blocks)
1443 BUG();
1444 if (end_block < 0 || end_block >= blocks)
1445 BUG();
1446 // FIXME: currently we assume page alignment.
1447 if (page->offset & (PAGE_SIZE-1))
1448 BUG();
1450 i = 0;
1451 bh = head;
1452 do {
1453 if (!bh)
1454 BUG();
1456 if ((i < start_block) || (i > end_block)) {
1457 goto skip;
1459 unlock_kernel();
1461 err = -EFAULT;
1462 if (start_offset) {
1463 len = start_bytes;
1464 start_offset = 0;
1465 } else
1466 if (end_bytes && (i == end_block)) {
1467 len = end_bytes;
1468 end_bytes = 0;
1469 } else {
1471 * Overwritten block.
1473 len = blocksize;
1475 if (copy_from_user(target_buf, buf, len))
1476 goto out_nolock;
1477 target_buf += len;
1478 buf += len;
1481 * we dirty buffers only after copying the data into
1482 * the page - this way we can dirty the buffer even if
1483 * the bh is still doing IO.
1485 lock_kernel();
1486 if (!bh->b_blocknr) {
1487 err = -EIO;
1488 down(&inode->i_sem);
1489 phys = fs_get_block (inode, block, 1, &err, &created);
1490 up(&inode->i_sem);
1491 if (!phys)
1492 goto out;
1494 init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
1497 * if partially written block which has contents on
1498 * disk, then we have to read it first.
1500 if (!created && (start_offset ||
1501 (end_bytes && (i == end_block)))) {
1502 bh->b_state = 0;
1503 ll_rw_block(READ, 1, &bh);
1504 wait_on_buffer(bh);
1505 err = -EIO;
1506 if (!buffer_uptodate(bh))
1507 goto out;
1510 bh->b_state = (1<<BH_Uptodate);
1511 } else {
1513 * block already exists, just mark it uptodate:
1515 bh->b_end_io = end_buffer_io_sync;
1516 set_bit(BH_Uptodate, &bh->b_state);
1518 mark_buffer_dirty(bh, 0);
1519 skip:
1520 i++;
1521 block++;
1522 bh = bh->b_this_page;
1523 } while (bh != head);
1524 unlock_kernel();
1526 SetPageUptodate(page);
1527 return bytes;
1528 out:
1529 unlock_kernel();
1530 out_nolock:
1531 ClearPageUptodate(page);
1532 return err;
1536 * Start I/O on a page.
1537 * This function expects the page to be locked and may return
1538 * before I/O is complete. You then have to check page->locked,
1539 * page->uptodate, and maybe wait on page->wait.
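*
* The b[] array supplies one block number for each size-byte piece of
* the page, so callers such as generic_readpage() below pass
* PAGE_SIZE/size entries (at most MAX_BUF_PER_PAGE).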
1541 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1543 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1544 int nr, fresh, block;
1546 if (!PageLocked(page))
1547 panic("brw_page: page not locked for I/O");
1548 // clear_bit(PG_error, &page->flags);
1550 * We pretty much rely on the page lock for this, because
1551 * create_page_buffers() might sleep.
1553 fresh = 0;
1554 if (!page->buffers) {
1555 create_page_buffers(rw, page, dev, b, size, bmap);
1556 fresh = 1;
1558 if (!page->buffers)
1559 BUG();
1560 page->owner = -1;
1562 head = page->buffers;
1563 bh = head;
1564 nr = 0;
1565 do {
1566 block = *(b++);
1568 if (fresh && (bh->b_count != 0))
1569 BUG();
1570 if (rw == READ) {
1571 if (!fresh)
1572 BUG();
1573 if (bmap && !block) {
1574 if (block)
1575 BUG();
1576 } else {
1577 if (bmap && !block)
1578 BUG();
1579 if (!buffer_uptodate(bh)) {
1580 arr[nr++] = bh;
1583 } else { /* WRITE */
1584 if (!bh->b_blocknr) {
1585 if (!block)
1586 BUG();
1587 bh->b_blocknr = block;
1588 } else {
1589 if (!block)
1590 BUG();
1592 set_bit(BH_Uptodate, &bh->b_state);
1593 mark_buffer_dirty(bh, 0);
1594 arr[nr++] = bh;
1596 bh = bh->b_this_page;
1597 } while (bh != head);
1598 if (rw == READ)
1599 ++current->maj_flt;
1600 if ((rw == READ) && nr) {
1601 if (Page_Uptodate(page))
1602 BUG();
1603 unlock_kernel();
1604 ll_rw_block(rw, nr, arr);
1605 lock_kernel();
1606 } else {
1607 if (!nr && rw == READ) {
1608 SetPageUptodate(page);
1609 page->owner = (int)current;
1610 UnlockPage(page);
1612 if (nr && (rw == WRITE)) {
1613 unlock_kernel();
1614 ll_rw_block(rw, nr, arr);
1615 lock_kernel();
1618 return 0;
1622 * This is called by end_request() when I/O has completed.
1624 void mark_buffer_uptodate(struct buffer_head * bh, int on)
1626 if (on) {
1627 struct buffer_head *tmp = bh;
1628 struct page *page;
1629 set_bit(BH_Uptodate, &bh->b_state);
1630 /* If a page has buffers and all these buffers are uptodate,
1631 * then the page is uptodate. */
1632 do {
1633 if (!test_bit(BH_Uptodate, &tmp->b_state))
1634 return;
1635 tmp=tmp->b_this_page;
1636 } while (tmp && tmp != bh);
1637 page = mem_map + MAP_NR(bh->b_data);
1638 SetPageUptodate(page);
1639 return;
1641 clear_bit(BH_Uptodate, &bh->b_state);
1645 * Generic "readpage" function for block devices that have the normal
1646 * bmap functionality. This is most of the block device filesystems.
1647 * Reads the page asynchronously --- the unlock_buffer() and
1648 * mark_buffer_uptodate() functions propagate buffer state into the
1649 * page struct once IO has completed.
1651 int generic_readpage(struct file * file, struct page * page)
1653 struct dentry *dentry = file->f_dentry;
1654 struct inode *inode = dentry->d_inode;
1655 unsigned long block;
1656 int *p, nr[PAGE_SIZE/512];
1657 int i;
1659 if (page->buffers) {
1660 printk("hm, no brw_page(%p) because IO already started.\n",
1661 page);
1662 goto out;
1665 i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1666 block = page->offset >> inode->i_sb->s_blocksize_bits;
1667 p = nr;
1668 do {
1669 *p = inode->i_op->bmap(inode, block);
1670 i--;
1671 block++;
1672 p++;
1673 } while (i > 0);
1675 /* IO start */
1676 brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
1677 out:
1678 return 0;
1682 * Try to increase the number of buffers available: the size argument
1683 * is used to determine what kind of buffers we want.
1685 static int grow_buffers(int size)
1687 unsigned long page;
1688 struct buffer_head *bh, *tmp;
1689 struct buffer_head * insert_point;
1690 int isize;
1692 if ((size & 511) || (size > PAGE_SIZE)) {
1693 printk("VFS: grow_buffers: size = %d\n",size);
1694 return 0;
1697 if (!(page = __get_free_page(GFP_BUFFER)))
1698 return 0;
1699 bh = create_buffers(page, size, 0);
1700 if (!bh) {
1701 free_page(page);
1702 return 0;
1705 isize = BUFSIZE_INDEX(size);
1706 insert_point = free_list[isize];
1708 tmp = bh;
1709 while (1) {
1710 if (insert_point) {
1711 tmp->b_next_free = insert_point->b_next_free;
1712 tmp->b_prev_free = insert_point;
1713 insert_point->b_next_free->b_prev_free = tmp;
1714 insert_point->b_next_free = tmp;
1715 } else {
1716 tmp->b_prev_free = tmp;
1717 tmp->b_next_free = tmp;
1719 insert_point = tmp;
1720 if (tmp->b_this_page)
1721 tmp = tmp->b_this_page;
1722 else
1723 break;
1725 tmp->b_this_page = bh;
1726 free_list[isize] = bh;
1727 mem_map[MAP_NR(page)].buffers = bh;
1728 buffermem += PAGE_SIZE;
1729 return 1;
1733 * Can the buffer be thrown out?
1735 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1736 #define buffer_busy(bh) ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
1739 * try_to_free_buffers() checks if all the buffers on this particular page
1740 * are unused, and frees the page if so.
1742 * Wake up bdflush() if this fails - if we're running low on memory due
1743 * to dirty buffers, we need to flush them out as quickly as possible.
1745 int try_to_free_buffers(struct page * page)
1747 struct buffer_head * tmp, * bh = page->buffers;
1749 tmp = bh;
1750 do {
1751 struct buffer_head * p = tmp;
1753 tmp = tmp->b_this_page;
1754 if (!buffer_busy(p))
1755 continue;
1757 wakeup_bdflush(0);
1758 return 0;
1759 } while (tmp != bh);
1761 tmp = bh;
1762 do {
1763 struct buffer_head * p = tmp;
1764 tmp = tmp->b_this_page;
1766 /* The buffer can be either on the regular queues or on the free list.. */
1767 if (p->b_dev == B_FREE)
1768 remove_from_free_list(p);
1769 else
1770 remove_from_queues(p);
1772 put_unused_buffer_head(p);
1773 } while (tmp != bh);
1775 /* Wake up anyone waiting for buffer heads */
1776 wake_up(&buffer_wait);
1778 /* And free the page */
1779 page->buffers = NULL;
1780 if (__free_page(page)) {
1781 buffermem -= PAGE_SIZE;
1782 return 1;
1784 return 0;
1787 /* ================== Debugging =================== */
1789 void show_buffers(void)
1791 struct buffer_head * bh;
1792 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
1793 int protected = 0;
1794 int nlist;
1795 static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
1797 printk("Buffer memory: %6dkB\n",buffermem>>10);
1798 printk("Buffer heads: %6d\n",nr_buffer_heads);
1799 printk("Buffer blocks: %6d\n",nr_buffers);
1800 printk("Buffer hashed: %6d\n",nr_hashed_buffers);
1802 for(nlist = 0; nlist < NR_LIST; nlist++) {
1803 found = locked = dirty = used = lastused = protected = 0;
1804 bh = lru_list[nlist];
1805 if(!bh) continue;
1807 do {
1808 found++;
1809 if (buffer_locked(bh))
1810 locked++;
1811 if (buffer_protected(bh))
1812 protected++;
1813 if (buffer_dirty(bh))
1814 dirty++;
1815 if (bh->b_count)
1816 used++, lastused = found;
1817 bh = bh->b_next_free;
1818 } while (bh != lru_list[nlist]);
1819 printk("%8s: %d buffers, %d used (last=%d), "
1820 "%d locked, %d protected, %d dirty\n",
1821 buf_types[nlist], found, used, lastused,
1822 locked, protected, dirty);
1827 /* ===================== Init ======================= */
1830 * allocate the hash table and init the free list
1831 * Use gfp() for the hash table to decrease TLB misses, use
1832 * SLAB cache for buffer heads.
1834 void __init buffer_init(unsigned long memory_size)
1836 int order;
1837 unsigned int nr_hash;
1839 /* We need to guess at the right sort of size for a buffer cache.
1840 The heuristic, from working with large databases and keeping
1841 ext2 fsync times manageable, is the following: */
1843 memory_size >>= 22;
1844 for (order = 5; (1UL << order) < memory_size; order++);
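/* Illustrative sizing, assuming a 32-bit box with 4kB pages: 128MB of
 * memory gives memory_size == 32 and order 5, i.e. 32 pages or 32768
 * hash slots; 1GB gives order 8 and 262144 slots.
 */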
1846 /* try to allocate something until we get it or we're asking
1847 for something that is really too small */
1849 do {
1850 nr_hash = (1UL << order) * PAGE_SIZE /
1851 sizeof(struct buffer_head *);
1852 hash_table = (struct buffer_head **)
1853 __get_free_pages(GFP_ATOMIC, order);
1854 } while (hash_table == NULL && --order > 4);
1855 printk("buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", nr_hash, order, (1UL<<order) * PAGE_SIZE);
1857 if (!hash_table)
1858 panic("Failed to allocate buffer hash table\n");
1859 memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
1860 bh_hash_mask = nr_hash-1;
1862 bh_cachep = kmem_cache_create("buffer_head",
1863 sizeof(struct buffer_head),
1865 SLAB_HWCACHE_ALIGN, NULL, NULL);
1866 if(!bh_cachep)
1867 panic("Cannot create buffer head SLAB cache\n");
1869 * Allocate the reserved buffer heads.
1871 while (nr_buffer_heads < NR_RESERVED) {
1872 struct buffer_head * bh;
1874 bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
1875 if (!bh)
1876 break;
1877 put_unused_buffer_head(bh);
1878 nr_buffer_heads++;
1881 lru_list[BUF_CLEAN] = 0;
1882 grow_buffers(BLOCK_SIZE);
1886 /* ====================== bdflush support =================== */
1888 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1889 * response to dirty buffers. Once this process is activated, we write back
1890 * a limited number of buffers to the disks and then go back to sleep again.
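*
* wakeup_bdflush(0) merely wakes the daemon; wakeup_bdflush(1) also
* kicks the disk task queue and then sleeps on bdflush_done until the
* daemon has finished a flushing pass.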
1892 static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
1893 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
1894 struct task_struct *bdflush_tsk = 0;
1896 void wakeup_bdflush(int wait)
1898 if (current == bdflush_tsk)
1899 return;
1900 if (wait)
1901 run_task_queue(&tq_disk);
1902 wake_up(&bdflush_wait);
1903 if (wait)
1904 sleep_on(&bdflush_done);
1909 * Here we attempt to write back old buffers. We also try to flush inodes
1910 * and supers as well, since this function is essentially "update", and
1911 * otherwise there would be no way of ensuring that these quantities ever
1912 * get written back. Ideally, we would have a timestamp on the inodes
1913 * and superblocks so that we could write back only the old ones as well
1916 static int sync_old_buffers(void)
1918 int i;
1919 int ndirty, nwritten;
1920 int nlist;
1921 int ncount;
1922 struct buffer_head * bh, *next;
1924 sync_supers(0);
1925 sync_inodes(0);
1927 ncount = 0;
1928 #ifdef DEBUG
1929 for(nlist = 0; nlist < NR_LIST; nlist++)
1930 #else
1931 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
1932 #endif
1934 ndirty = 0;
1935 nwritten = 0;
1936 repeat:
1938 bh = lru_list[nlist];
1939 if(bh)
1940 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1941 /* We may have stalled while waiting for I/O to complete. */
1942 if(bh->b_list != nlist) goto repeat;
1943 next = bh->b_next_free;
1944 if(!lru_list[nlist]) {
1945 printk("Dirty list empty %d\n", i);
1946 break;
1949 /* Clean buffer on dirty list? Refile it */
1950 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
1951 refile_buffer(bh);
1952 continue;
1955 /* Unlocked buffer on locked list? Refile it */
1956 if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
1957 refile_buffer(bh);
1958 continue;
1961 if (buffer_locked(bh) || !buffer_dirty(bh))
1962 continue;
1963 ndirty++;
1964 if(time_before(jiffies, bh->b_flushtime))
1965 continue;
1966 nwritten++;
1967 next->b_count++;
1968 bh->b_count++;
1969 bh->b_flushtime = 0;
1970 #ifdef DEBUG
1971 if(nlist != BUF_DIRTY) ncount++;
1972 #endif
1973 ll_rw_block(WRITE, 1, &bh);
1974 bh->b_count--;
1975 next->b_count--;
1978 run_task_queue(&tq_disk);
1979 #ifdef DEBUG
1980 if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
1981 printk("Wrote %d/%d buffers\n", nwritten, ndirty);
1982 #endif
1983 run_task_queue(&tq_disk);
1984 return 0;
1988 /* This is the interface to bdflush. As we get more sophisticated, we can
1989 * pass tuning parameters to this "process", to adjust how it behaves.
1990 * We would want to verify each parameter, however, to make sure that it
1991 * is reasonable. */
1993 asmlinkage int sys_bdflush(int func, long data)
1995 int i, error = -EPERM;
1997 lock_kernel();
1998 if (!capable(CAP_SYS_ADMIN))
1999 goto out;
2001 if (func == 1) {
2002 error = sync_old_buffers();
2003 goto out;
2006 /* Basically func 1 means read param 1, 2 means write param 1, etc */
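/* Concretely: func == 1 runs sync_old_buffers() above; for N >= 0,
 * func == 2*N+2 reads bdf_prm.data[N] back to user space and
 * func == 2*N+3 writes it, range-checked against bdflush_min/max.
 */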
2007 if (func >= 2) {
2008 i = (func-2) >> 1;
2009 error = -EINVAL;
2010 if (i < 0 || i >= N_PARAM)
2011 goto out;
2012 if((func & 1) == 0) {
2013 error = put_user(bdf_prm.data[i], (int*)data);
2014 goto out;
2016 if (data < bdflush_min[i] || data > bdflush_max[i])
2017 goto out;
2018 bdf_prm.data[i] = data;
2019 error = 0;
2020 goto out;
2023 /* Func 0 used to launch the actual bdflush and then never
2024 * return (unless explicitly killed). We return zero here to
2025 * remain semi-compatible with present update(8) programs.
2027 error = 0;
2028 out:
2029 unlock_kernel();
2030 return error;
2033 /* This is the actual bdflush daemon itself. It used to be started from
2034 * the syscall above, but now we launch it ourselves internally with
2035 * kernel_thread(...) directly after the first thread in init/main.c */
2037 /* To prevent deadlocks for a loop device:
2038 * 1) Do non-blocking writes to loop (avoids deadlock with running
2039 * out of request blocks).
2040 * 2) But do a blocking write if the only dirty buffers are loop buffers
2041 * (otherwise we go into an infinite busy-loop).
2042 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
2043 * with running out of free buffers for loop's "real" device).
2045 int bdflush(void * unused)
2047 int i;
2048 int ndirty;
2049 int nlist;
2050 int ncount;
2051 struct buffer_head * bh, *next;
2052 int major;
2053 int wrta_cmd = WRITEA; /* non-blocking write for LOOP */
2056 * We have a bare-bones task_struct, and really should fill
2057 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2058 * display semi-sane things. Not real crucial though...
2061 current->session = 1;
2062 current->pgrp = 1;
2063 sprintf(current->comm, "kflushd");
2064 bdflush_tsk = current;
2067 * As a kernel thread we want to tamper with system buffers
2068 * and other internals and thus be subject to the SMP locking
2069 * rules. (On a uniprocessor box this does nothing).
2071 lock_kernel();
2073 for (;;) {
2074 #ifdef DEBUG
2075 printk("bdflush() activated...");
2076 #endif
2078 CHECK_EMERGENCY_SYNC
2080 ncount = 0;
2081 #ifdef DEBUG
2082 for(nlist = 0; nlist < NR_LIST; nlist++)
2083 #else
2084 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
2085 #endif
2087 ndirty = 0;
2088 repeat:
2090 bh = lru_list[nlist];
2091 if(bh)
2092 for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
2093 bh = next) {
2094 /* We may have stalled while waiting for I/O to complete. */
2095 if(bh->b_list != nlist) goto repeat;
2096 next = bh->b_next_free;
2097 if(!lru_list[nlist]) {
2098 printk("Dirty list empty %d\n", i);
2099 break;
2102 /* Clean buffer on dirty list? Refile it */
2103 if (nlist == BUF_DIRTY && !buffer_dirty(bh)) {
2104 refile_buffer(bh);
2105 continue;
2108 /* Unlocked buffer on locked list? Refile it */
2109 if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
2110 refile_buffer(bh);
2111 continue;
2114 if (buffer_locked(bh) || !buffer_dirty(bh))
2115 continue;
2116 major = MAJOR(bh->b_dev);
2117 /* Should we write back buffers that are shared or not??
2118 currently dirty buffers are not shared, so it does not matter */
2119 next->b_count++;
2120 bh->b_count++;
2121 ndirty++;
2122 bh->b_flushtime = 0;
2123 if (major == LOOP_MAJOR) {
2124 ll_rw_block(wrta_cmd,1, &bh);
2125 wrta_cmd = WRITEA;
2126 if (buffer_dirty(bh))
2127 --ndirty;
2129 else
2130 ll_rw_block(WRITE, 1, &bh);
2131 #ifdef DEBUG
2132 if(nlist != BUF_DIRTY) ncount++;
2133 #endif
2134 bh->b_count--;
2135 next->b_count--;
2136 wake_up(&buffer_wait);
2139 #ifdef DEBUG
2140 if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
2141 printk("sleeping again.\n");
2142 #endif
2143 /* If we didn't write anything, but there are still
2144 * dirty buffers, then make the next write to a
2145 * loop device to be a blocking write.
2146 * This lets us block--which we _must_ do! */
2147 if (ndirty == 0 && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
2148 wrta_cmd = WRITE;
2149 continue;
2151 run_task_queue(&tq_disk);
2152 wake_up(&bdflush_done);
2154 /* If there are still a lot of dirty buffers around, skip the sleep
2155 and flush some more */
2156 if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
2157 spin_lock_irq(&current->sigmask_lock);
2158 flush_signals(current);
2159 spin_unlock_irq(&current->sigmask_lock);
2161 interruptible_sleep_on(&bdflush_wait);