4 * Copyright (C) 1991, 1992 Linus Torvalds
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
27 #include <linux/malloc.h>
28 #include <linux/locks.h>
29 #include <linux/errno.h>
30 #include <linux/swap.h>
31 #include <linux/swapctl.h>
32 #include <linux/smp_lock.h>
33 #include <linux/vmalloc.h>
34 #include <linux/blkdev.h>
35 #include <linux/sysrq.h>
36 #include <linux/file.h>
37 #include <linux/init.h>
38 #include <linux/quotaops.h>
40 #include <asm/uaccess.h>
42 #include <asm/bitops.h>
/*
 * Map (block size >> 9) to a small table index: 512->0, 1024->1,
 * 2048->2, 4096->3, 8192->4, 16384->5; -1 marks sizes that are not a
 * supported power-of-two block size.  Consumed via BUFSIZE_INDEX()
 * below to pick a free_list[] slot.
 * NOTE(review): this extraction is missing the final initializer row
 * (index 64, presumably 6 for the 32k buffers mentioned in the file
 * header -- confirm against the full source) and the closing "};".
 */
45 static char buffersize_index
[65] =
46 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
47 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
48 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
49 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
/* Convert a buffer size in bytes to its buffersize_index[] slot (size/512). */
52 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
/* Maximum number of 512-byte buffers that fit in a single page. */
53 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
/* Buffer heads held in reserve for async (paging/swap) I/O, to avoid
 * the no-buffer-head deadlock described at get_unused_buffer_head(). */
54 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
/*
 * Cap on the unused_list length: put_unused_buffer_head() frees heads
 * back to the SLAB cache beyond this point.
 * The expansion is parenthesized so the macro stays correct inside a
 * larger expression (e.g. `2*MAX_UNUSED_BUFFERS`); the unparenthesized
 * form `NR_RESERVED+20` would bind wrongly there (CERT PRE02-C).
 * All current uses (a `>=` comparison) are unaffected.
 */
#define MAX_UNUSED_BUFFERS (NR_RESERVED+20)	/* don't ever have more than this
						   number of unused buffer heads */
61 static unsigned long bh_hash_mask
= 0;
63 static int grow_buffers(int size
);
65 static struct buffer_head
** hash_table
;
66 static struct buffer_head
* lru_list
[NR_LIST
] = {NULL
, };
67 static struct buffer_head
* free_list
[NR_SIZES
] = {NULL
, };
69 static kmem_cache_t
*bh_cachep
;
71 static struct buffer_head
* unused_list
= NULL
;
72 static struct buffer_head
* reuse_list
= NULL
;
73 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait
);
75 static int nr_buffers
= 0;
76 static int nr_buffers_type
[NR_LIST
] = {0,};
77 static int nr_buffer_heads
= 0;
78 static int nr_unused_buffer_heads
= 0;
79 static int nr_hashed_buffers
= 0;
81 /* This is used by some architectures to estimate available memory. */
84 /* Here is the parameter block for the bdflush process. If you add or
85 * remove any of the parameters, make sure to update kernel/sysctl.c.
90 /* The dummy values in this structure are left in there for compatibility
91 * with old programs that play with the /proc entries.
95 int nfract
; /* Percentage of buffer cache dirty to
97 int ndirty
; /* Maximum number of dirty blocks to write out per
99 int nrefill
; /* Number of clean buffers to try to obtain
100 each time we call refill */
101 int nref_dirt
; /* Dirty buffer threshold for activating bdflush
102 when trying to refill buffers. */
103 int interval
; /* Interval (seconds) between spontaneous
105 int age_buffer
; /* Time for normal buffer to age before
107 int age_super
; /* Time for superblock to age before we
109 int dummy2
; /* unused */
110 int dummy3
; /* unused */
112 unsigned int data
[N_PARAM
];
113 } bdf_prm
= {{40, 500, 64, 256, 5, 30*HZ
, 5*HZ
, 1884, 2}};
115 /* These are the min and max parameter values that we will allow to be assigned */
116 int bdflush_min
[N_PARAM
] = { 0, 10, 5, 25, 1, 1*HZ
, 1*HZ
, 1, 1};
117 int bdflush_max
[N_PARAM
] = {100,5000, 2000, 2000,100, 600*HZ
, 600*HZ
, 2047, 5};
119 void wakeup_bdflush(int);
122 * Rewrote the wait-routines to use the "new" wait-queue functionality,
123 * and getting rid of the cli-sti pairs. The wait-queue routines still
124 * need cli-sti, but now it's just a couple of 386 instructions or so.
126 * Note that the real wait_on_buffer() is an inline function that checks
127 * if 'b_wait' is set before calling this, so that the queues aren't set
/*
 * Sleep until buffer 'bh' is unlocked.  Per the comment above, the
 * real wait_on_buffer() is an inline wrapper that checks b_wait first,
 * so this slow path only runs when a wait is genuinely needed.
 * NOTE(review): this extraction is missing several original lines --
 * the opening '{', the body of the locked-check (presumably a
 * schedule() loop; confirm against the full source) and the closing
 * '}'.  The surviving fragments are annotated byte-for-byte below.
 */
130 void __wait_on_buffer(struct buffer_head
* bh
)
132 struct task_struct
*tsk
= current
;
/* Wait-queue entry naming the current task; linked onto bh->b_wait. */
133 DECLARE_WAITQUEUE(wait
, tsk
);
136 add_wait_queue(&bh
->b_wait
, &wait
);
/* Mark ourselves asleep BEFORE kicking the disk queue, so the unlock
 * wakeup cannot slip in between and be missed. */
138 tsk
->state
= TASK_UNINTERRUPTIBLE
;
139 run_task_queue(&tq_disk
);
140 if (buffer_locked(bh
)) {
/* (original lines 141-143 elided by the extraction) */
/* Awake again: restore runnable state and unlink from the queue. */
144 tsk
->state
= TASK_RUNNING
;
145 remove_wait_queue(&bh
->b_wait
, &wait
);
149 /* Call sync_buffers with wait!=0 to ensure that the call does not
150 * return until all buffer writes have completed. Sync() may return
151 * before the writes have finished; fsync() may not.
154 /* Godamity-damn. Some buffers (bitmaps for filesystems)
155 * spontaneously dirty themselves without ever brelse being called.
156 * We will ultimately want to put these in a separate list, but for
157 * now we search all of the lists for dirty buffers.
159 static int sync_buffers(kdev_t dev
, int wait
)
161 int i
, retry
, pass
= 0, err
= 0;
162 struct buffer_head
* bh
, *next
;
164 /* One pass for no-wait, three for wait:
165 * 0) write out all dirty, unlocked buffers;
166 * 1) write out all dirty buffers, waiting if locked;
167 * 2) wait for completion by waiting for all buffers to unlock.
172 /* We search all lists as a failsafe mechanism, not because we expect
173 * there to be dirty buffers on any of the other lists.
175 bh
= lru_list
[BUF_DIRTY
];
178 for (i
= nr_buffers_type
[BUF_DIRTY
]*2 ; i
-- > 0 ; bh
= next
) {
179 if (bh
->b_list
!= BUF_DIRTY
)
181 next
= bh
->b_next_free
;
182 if (!lru_list
[BUF_DIRTY
])
184 if (dev
&& bh
->b_dev
!= dev
)
186 if (buffer_locked(bh
)) {
187 /* Buffer is locked; skip it unless wait is
188 * requested AND pass > 0.
190 if (!wait
|| !pass
) {
198 /* If an unlocked buffer is not uptodate, there has
199 * been an IO error. Skip it.
201 if (wait
&& buffer_req(bh
) && !buffer_locked(bh
) &&
202 !buffer_dirty(bh
) && !buffer_uptodate(bh
)) {
207 /* Don't write clean buffers. Don't write ANY buffers
210 if (!buffer_dirty(bh
) || pass
>= 2)
213 /* Don't bother about locked buffers.
215 * XXX We checked if it was locked above and there is no
216 * XXX way we could have slept in between. -DaveM
218 if (buffer_locked(bh
))
223 ll_rw_block(WRITE
, 1, &bh
);
230 bh
= lru_list
[BUF_LOCKED
];
233 for (i
= nr_buffers_type
[BUF_LOCKED
]*2 ; i
-- > 0 ; bh
= next
) {
234 if (bh
->b_list
!= BUF_LOCKED
)
236 next
= bh
->b_next_free
;
237 if (!lru_list
[BUF_LOCKED
])
239 if (dev
&& bh
->b_dev
!= dev
)
241 if (buffer_locked(bh
)) {
242 /* Buffer is locked; skip it unless wait is
243 * requested AND pass > 0.
245 if (!wait
|| !pass
) {
254 /* If we are waiting for the sync to succeed, and if any dirty
255 * blocks were written, then repeat; on the second pass, only
256 * wait for buffers being written (do not pass to write any
257 * more buffers on the second pass).
259 } while (wait
&& retry
&& ++pass
<=2);
263 void sync_dev(kdev_t dev
)
265 sync_buffers(dev
, 0);
268 sync_buffers(dev
, 0);
271 * FIXME(eric) we need to sync the physical devices here.
272 * This is because some (scsi) controllers have huge amounts of
273 * cache onboard (hundreds of Mb), and we need to instruct
274 * them to commit all of the dirty memory to disk, and we should
275 * not return until this has happened.
277 * This would need to get implemented by going through the assorted
278 * layers so that each block major number can be synced, and this
279 * would call down into the upper and mid-layer scsi.
283 int fsync_dev(kdev_t dev
)
285 sync_buffers(dev
, 0);
289 return sync_buffers(dev
, 1);
292 asmlinkage
int sys_sync(void)
301 * filp may be NULL if called via the msync of a vma.
304 int file_fsync(struct file
*filp
, struct dentry
*dentry
)
306 struct inode
* inode
= dentry
->d_inode
;
307 struct super_block
* sb
;
310 /* sync the inode to buffers */
311 write_inode_now(inode
);
313 /* sync the superblock to buffers */
316 if (sb
->s_op
&& sb
->s_op
->write_super
)
317 sb
->s_op
->write_super(sb
);
319 /* .. finally sync the buffers to disk */
321 return sync_buffers(dev
, 1);
324 asmlinkage
int sys_fsync(unsigned int fd
)
327 struct dentry
* dentry
;
328 struct inode
* inode
;
337 dentry
= file
->f_dentry
;
341 inode
= dentry
->d_inode
;
346 if (!file
->f_op
|| !file
->f_op
->fsync
)
349 /* We need to protect against concurrent writers.. */
351 err
= file
->f_op
->fsync(file
, dentry
);
361 asmlinkage
int sys_fdatasync(unsigned int fd
)
364 struct dentry
* dentry
;
365 struct inode
* inode
;
374 dentry
= file
->f_dentry
;
378 inode
= dentry
->d_inode
;
383 if (!file
->f_op
|| !file
->f_op
->fsync
)
386 /* this needs further work, at the moment it is identical to fsync() */
388 err
= file
->f_op
->fsync(file
, dentry
);
398 void invalidate_buffers(kdev_t dev
)
402 struct buffer_head
* bh
;
404 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
405 bh
= lru_list
[nlist
];
406 for (i
= nr_buffers_type
[nlist
]*2 ; --i
> 0 ; bh
= bh
->b_next_free
) {
407 if (bh
->b_dev
!= dev
)
410 if (bh
->b_dev
!= dev
)
415 clear_bit(BH_Protected
, &bh
->b_state
);
416 clear_bit(BH_Uptodate
, &bh
->b_state
);
417 clear_bit(BH_Dirty
, &bh
->b_state
);
418 clear_bit(BH_Req
, &bh
->b_state
);
/*
 * Hash a (dev, block) pair into a slot of hash_table[]; bh_hash_mask
 * is (table size - 1), so the table size must be a power of two (see
 * buffer_init()).  The 'block' parameter is parenthesized in the
 * expansion so an expression argument cannot bind wrongly against the
 * '^' operator (CERT PRE01-C); all current callers pass simple
 * lvalues, so behavior is unchanged.
 */
#define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^(block))) & bh_hash_mask)
#define hash(dev,block) hash_table[_hashfn(dev,block)]
426 static inline void remove_from_hash_queue(struct buffer_head
* bh
)
428 struct buffer_head
**pprev
= bh
->b_pprev
;
430 struct buffer_head
* next
= bh
->b_next
;
432 next
->b_pprev
= pprev
;
441 static inline void remove_from_lru_list(struct buffer_head
* bh
)
443 if (!(bh
->b_prev_free
) || !(bh
->b_next_free
))
444 panic("VFS: LRU block list corrupted");
445 if (bh
->b_dev
== B_FREE
)
446 panic("LRU list corrupted");
447 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
448 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
450 if (lru_list
[bh
->b_list
] == bh
)
451 lru_list
[bh
->b_list
] = bh
->b_next_free
;
452 if (lru_list
[bh
->b_list
] == bh
)
453 lru_list
[bh
->b_list
] = NULL
;
454 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
457 static inline void remove_from_free_list(struct buffer_head
* bh
)
459 int isize
= BUFSIZE_INDEX(bh
->b_size
);
460 if (!(bh
->b_prev_free
) || !(bh
->b_next_free
))
461 panic("VFS: Free block list corrupted");
462 if(bh
->b_dev
!= B_FREE
)
463 panic("Free list corrupted");
464 if(!free_list
[isize
])
465 panic("Free list empty");
466 if(bh
->b_next_free
== bh
)
467 free_list
[isize
] = NULL
;
469 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
470 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
471 if (free_list
[isize
] == bh
)
472 free_list
[isize
] = bh
->b_next_free
;
474 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
477 static void remove_from_queues(struct buffer_head
* bh
)
479 if(bh
->b_dev
== B_FREE
) {
480 remove_from_free_list(bh
); /* Free list entries should not be
484 nr_buffers_type
[bh
->b_list
]--;
485 remove_from_hash_queue(bh
);
486 remove_from_lru_list(bh
);
489 static inline void put_last_free(struct buffer_head
* bh
)
492 struct buffer_head
**bhp
= &free_list
[BUFSIZE_INDEX(bh
->b_size
)];
494 bh
->b_dev
= B_FREE
; /* So it is obvious we are on the free list. */
496 /* Add to back of free list. */
499 bh
->b_prev_free
= bh
;
502 bh
->b_next_free
= *bhp
;
503 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
504 (*bhp
)->b_prev_free
->b_next_free
= bh
;
505 (*bhp
)->b_prev_free
= bh
;
509 static void insert_into_queues(struct buffer_head
* bh
)
511 /* put at end of free list */
512 if(bh
->b_dev
== B_FREE
) {
515 struct buffer_head
**bhp
= &lru_list
[bh
->b_list
];
519 bh
->b_prev_free
= bh
;
523 panic("VFS: buffer LRU pointers corrupted");
525 bh
->b_next_free
= *bhp
;
526 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
527 (*bhp
)->b_prev_free
->b_next_free
= bh
;
528 (*bhp
)->b_prev_free
= bh
;
530 nr_buffers_type
[bh
->b_list
]++;
532 /* Put the buffer in new hash-queue if it has a device. */
536 struct buffer_head
**bhp
= &hash(bh
->b_dev
, bh
->b_blocknr
);
537 struct buffer_head
*next
= *bhp
;
541 next
->b_pprev
= &bh
->b_next
;
550 struct buffer_head
* find_buffer(kdev_t dev
, int block
, int size
)
552 struct buffer_head
* next
;
554 next
= hash(dev
,block
);
556 struct buffer_head
*tmp
= next
;
560 if (tmp
->b_blocknr
!= block
|| tmp
->b_size
!= size
|| tmp
->b_dev
!= dev
)
569 * Why like this, I hear you say... The reason is race-conditions.
570 * As we don't lock buffers (unless we are reading them, that is),
571 * something might happen to it while we sleep (ie a read-error
572 * will force it bad). This shouldn't really happen currently, but
575 struct buffer_head
* get_hash_table(kdev_t dev
, int block
, int size
)
577 struct buffer_head
* bh
;
578 bh
= find_buffer(dev
,block
,size
);
584 unsigned int get_hardblocksize(kdev_t dev
)
587 * Get the hard sector size for the given device. If we don't know
588 * what it is, return 0.
590 if (hardsect_size
[MAJOR(dev
)] != NULL
) {
591 int blksize
= hardsect_size
[MAJOR(dev
)][MINOR(dev
)];
597 * We don't know what the hardware sector size for this device is.
598 * Return 0 indicating that we don't know.
603 void set_blocksize(kdev_t dev
, int size
)
605 extern int *blksize_size
[];
607 struct buffer_head
* bh
, *bhnext
;
609 if (!blksize_size
[MAJOR(dev
)])
612 /* Size must be a power of two, and between 512 and PAGE_SIZE */
613 if (size
> PAGE_SIZE
|| size
< 512 || (size
& (size
-1)))
614 panic("Invalid blocksize passed to set_blocksize");
616 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == 0 && size
== BLOCK_SIZE
) {
617 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
620 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == size
)
622 sync_buffers(dev
, 2);
623 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
625 /* We need to be quite careful how we do this - we are moving entries
626 * around on the free list, and we can get in a loop if we are not careful.
628 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
629 bh
= lru_list
[nlist
];
630 for (i
= nr_buffers_type
[nlist
]*2 ; --i
> 0 ; bh
= bhnext
) {
634 bhnext
= bh
->b_next_free
;
635 if (bh
->b_dev
!= dev
)
637 if (bh
->b_size
== size
)
642 if (bh
->b_dev
== dev
&& bh
->b_size
!= size
) {
643 clear_bit(BH_Dirty
, &bh
->b_state
);
644 clear_bit(BH_Uptodate
, &bh
->b_state
);
645 clear_bit(BH_Req
, &bh
->b_state
);
648 remove_from_hash_queue(bh
);
654 * We used to try various strange things. Let's not.
656 static void refill_freelist(int size
)
658 if (!grow_buffers(size
)) {
660 current
->policy
|= SCHED_YIELD
;
665 void init_buffer(struct buffer_head
*bh
, kdev_t dev
, int block
,
666 bh_end_io_t
*handler
, void *dev_id
)
669 bh
->b_list
= BUF_CLEAN
;
672 bh
->b_blocknr
= block
;
673 bh
->b_end_io
= handler
;
674 bh
->b_dev_id
= dev_id
;
677 static void end_buffer_io_sync(struct buffer_head
*bh
, int uptodate
)
679 mark_buffer_uptodate(bh
, uptodate
);
684 * Ok, this is getblk, and it isn't very clear, again to hinder
685 * race-conditions. Most of the code is seldom used, (ie repeating),
686 * so it should be much more efficient than it looks.
688 * The algorithm is changed: hopefully better, and an elusive bug removed.
690 * 14.02.92: changed it to sync dirty buffers a bit: better performance
691 * when the filesystem starts to get full of dirty blocks (I hope).
693 struct buffer_head
* getblk(kdev_t dev
, int block
, int size
)
695 struct buffer_head
* bh
;
699 bh
= get_hash_table(dev
, block
, size
);
701 if (!buffer_dirty(bh
)) {
707 isize
= BUFSIZE_INDEX(size
);
709 bh
= free_list
[isize
];
712 remove_from_free_list(bh
);
714 /* OK, FINALLY we know that this buffer is the only one of its kind,
715 * and that it's unused (b_count=0), unlocked, and clean.
717 init_buffer(bh
, dev
, block
, end_buffer_io_sync
, NULL
);
719 insert_into_queues(bh
);
723 * If we block while refilling the free list, somebody may
724 * create the buffer first ... search the hashes again.
727 refill_freelist(size
);
728 if (!find_buffer(dev
,block
,size
))
733 void set_writetime(struct buffer_head
* buf
, int flag
)
737 if (buffer_dirty(buf
)) {
738 /* Move buffer to dirty list if jiffies is clear. */
739 newtime
= jiffies
+ (flag
? bdf_prm
.b_un
.age_super
:
740 bdf_prm
.b_un
.age_buffer
);
741 if(!buf
->b_flushtime
|| buf
->b_flushtime
> newtime
)
742 buf
->b_flushtime
= newtime
;
744 buf
->b_flushtime
= 0;
750 * Put a buffer into the appropriate list, without side-effects.
752 static inline void file_buffer(struct buffer_head
*bh
, int list
)
754 remove_from_queues(bh
);
756 insert_into_queues(bh
);
760 * A buffer may need to be moved from one buffer list to another
761 * (e.g. in case it is not shared any more). Handle this.
763 void refile_buffer(struct buffer_head
* buf
)
767 if(buf
->b_dev
== B_FREE
) {
768 printk("Attempt to refile free buffer\n");
771 if (buffer_dirty(buf
))
773 else if (buffer_locked(buf
))
774 dispose
= BUF_LOCKED
;
777 if(dispose
!= buf
->b_list
) {
778 file_buffer(buf
, dispose
);
779 if(dispose
== BUF_DIRTY
) {
780 int too_many
= (nr_buffers
* bdf_prm
.b_un
.nfract
/100);
782 /* This buffer is dirty, maybe we need to start flushing.
783 * If too high a percentage of the buffers are dirty...
785 if (nr_buffers_type
[BUF_DIRTY
] > too_many
)
788 /* If this is a loop device, and
789 * more than half of the buffers are dirty...
790 * (Prevents no-free-buffers deadlock with loop device.)
792 if (MAJOR(buf
->b_dev
) == LOOP_MAJOR
&&
793 nr_buffers_type
[BUF_DIRTY
]*2>nr_buffers
)
800 * Release a buffer head
802 void __brelse(struct buffer_head
* buf
)
804 /* If dirty, mark the time this buffer should be written back. */
805 set_writetime(buf
, 0);
813 printk("VFS: brelse: Trying to free free buffer\n");
817 * bforget() is like brelse(), except it puts the buffer on the
818 * free list if it can.. We can NOT free the buffer if:
819 * - there are other users of it
820 * - it is locked and thus can have active IO
822 void __bforget(struct buffer_head
* buf
)
824 if (buf
->b_count
!= 1 || buffer_locked(buf
)) {
830 remove_from_queues(buf
);
835 * bread() reads a specified block and returns the buffer that contains
836 * it. It returns NULL if the block was unreadable.
838 struct buffer_head
* bread(kdev_t dev
, int block
, int size
)
840 struct buffer_head
* bh
;
842 bh
= getblk(dev
, block
, size
);
843 if (buffer_uptodate(bh
))
845 ll_rw_block(READ
, 1, &bh
);
847 if (buffer_uptodate(bh
))
854 * Ok, breada can be used as bread, but additionally to mark other
855 * blocks for reading as well. End the argument list with a negative
861 struct buffer_head
* breada(kdev_t dev
, int block
, int bufsize
,
862 unsigned int pos
, unsigned int filesize
)
864 struct buffer_head
* bhlist
[NBUF
];
866 struct buffer_head
* bh
;
876 bh
= getblk(dev
, block
, bufsize
);
877 index
= BUFSIZE_INDEX(bh
->b_size
);
879 if (buffer_uptodate(bh
))
881 else ll_rw_block(READ
, 1, &bh
);
883 blocks
= (filesize
- pos
) >> (9+index
);
885 if (blocks
< (read_ahead
[MAJOR(dev
)] >> index
))
886 blocks
= read_ahead
[MAJOR(dev
)] >> index
;
890 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
895 for(i
=1; i
<blocks
; i
++) {
896 bh
= getblk(dev
,block
+i
,bufsize
);
897 if (buffer_uptodate(bh
)) {
901 else bhlist
[j
++] = bh
;
904 /* Request the read for these buffers, and then release them. */
906 ll_rw_block(READA
, (j
-1), bhlist
+1);
910 /* Wait for this buffer, and then continue on. */
913 if (buffer_uptodate(bh
))
920 * Note: the caller should wake up the buffer_wait list if needed.
922 static void put_unused_buffer_head(struct buffer_head
* bh
)
924 if (nr_unused_buffer_heads
>= MAX_UNUSED_BUFFERS
) {
926 kmem_cache_free(bh_cachep
, bh
);
930 memset(bh
,0,sizeof(*bh
));
931 init_waitqueue_head(&bh
->b_wait
);
932 nr_unused_buffer_heads
++;
933 bh
->b_next_free
= unused_list
;
938 * We can't put completed temporary IO buffer_heads directly onto the
939 * unused_list when they become unlocked, since the device driver
940 * end_request routines still expect access to the buffer_head's
941 * fields after the final unlock. So, the device driver puts them on
942 * the reuse_list instead once IO completes, and we recover these to
943 * the unused_list here.
945 * Note that we don't do a wakeup here, but return a flag indicating
946 * whether we got any buffer heads. A task ready to sleep can check
947 * the returned value, and any tasks already sleeping will have been
948 * awakened when the buffer heads were added to the reuse list.
950 static inline int recover_reusable_buffer_heads(void)
952 struct buffer_head
*head
= xchg(&reuse_list
, NULL
);
957 struct buffer_head
*bh
= head
;
958 head
= head
->b_next_free
;
959 put_unused_buffer_head(bh
);
967 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
968 * no-buffer-head deadlock. Return NULL on failure; waiting for
969 * buffer heads is now handled in create_buffers().
971 static struct buffer_head
* get_unused_buffer_head(int async
)
973 struct buffer_head
* bh
;
975 recover_reusable_buffer_heads();
976 if (nr_unused_buffer_heads
> NR_RESERVED
) {
978 unused_list
= bh
->b_next_free
;
979 nr_unused_buffer_heads
--;
983 /* This is critical. We can't swap out pages to get
984 * more buffer heads, because the swap-out may need
985 * more buffer-heads itself. Thus SLAB_BUFFER.
987 if((bh
= kmem_cache_alloc(bh_cachep
, SLAB_BUFFER
)) != NULL
) {
988 memset(bh
, 0, sizeof(*bh
));
989 init_waitqueue_head(&bh
->b_wait
);
995 * If we need an async buffer, use the reserved buffer heads.
997 if (async
&& unused_list
) {
999 unused_list
= bh
->b_next_free
;
1000 nr_unused_buffer_heads
--;
1006 * (Pending further analysis ...)
1007 * Ordinary (non-async) requests can use a different memory priority
1008 * to free up pages. Any swapping thus generated will use async
1012 (bh
= kmem_cache_alloc(bh_cachep
, SLAB_KERNEL
)) != NULL
) {
1013 memset(bh
, 0, sizeof(*bh
));
1014 init_waitqueue_head(&bh
->b_wait
);
1024 * Create the appropriate buffers when given a page for data area and
1025 * the size of each buffer.. Use the bh->b_this_page linked list to
1026 * follow the buffers created. Return NULL if unable to create more
1028 * The async flag is used to differentiate async IO (paging, swapping)
1029 * from ordinary buffer allocations, and only async requests are allowed
1030 * to sleep waiting for buffer heads.
1032 static struct buffer_head
* create_buffers(unsigned long page
,
1033 unsigned long size
, int async
)
1035 DECLARE_WAITQUEUE(wait
, current
);
1036 struct buffer_head
*bh
, *head
;
1042 while ((offset
-= size
) >= 0) {
1043 bh
= get_unused_buffer_head(async
);
1047 bh
->b_dev
= B_FREE
; /* Flag as unused */
1048 bh
->b_this_page
= head
;
1052 bh
->b_next_free
= NULL
;
1056 bh
->b_data
= (char *) (page
+offset
);
1061 * In case anything failed, we just free everything we got.
1067 head
= head
->b_this_page
;
1068 put_unused_buffer_head(bh
);
1071 /* Wake up any waiters ... */
1072 wake_up(&buffer_wait
);
1076 * Return failure for non-async IO requests. Async IO requests
1077 * are not allowed to fail, so we have to wait until buffer heads
1078 * become available. But we don't want tasks sleeping with
1079 * partially complete buffers, so all were released above.
1084 /* We're _really_ low on memory. Now we just
1085 * wait for old buffer heads to become free due to
1086 * finishing IO. Since this is an async request and
1087 * the reserve list is empty, we're sure there are
1088 * async buffer heads in use.
1090 run_task_queue(&tq_disk
);
1093 * Set our state for sleeping, then check again for buffer heads.
1094 * This ensures we won't miss a wake_up from an interrupt.
1096 add_wait_queue(&buffer_wait
, &wait
);
1097 current
->state
= TASK_UNINTERRUPTIBLE
;
1098 if (!recover_reusable_buffer_heads())
1100 remove_wait_queue(&buffer_wait
, &wait
);
1101 current
->state
= TASK_RUNNING
;
1105 /* Run the hooks that have to be done when a page I/O has completed. */
1106 static inline void after_unlock_page (struct page
* page
)
1108 if (test_and_clear_bit(PG_decr_after
, &page
->flags
)) {
1109 atomic_dec(&nr_async_pages
);
1111 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1112 (char *) page_address(page
),
1113 atomic_read(&nr_async_pages
));
1116 if (test_and_clear_bit(PG_swap_unlock_after
, &page
->flags
))
1117 swap_after_unlock_page(page
->offset
);
1118 if (test_and_clear_bit(PG_free_after
, &page
->flags
))
1123 * Free all temporary buffers belonging to a page.
1124 * This needs to be called with interrupts disabled.
1126 static inline void free_async_buffers (struct buffer_head
* bh
)
1128 struct buffer_head
*tmp
, *tail
;
1131 * Link all the buffers into the b_next_free list,
1132 * so we only have to do one xchg() operation ...
1135 while ((tmp
= tail
->b_this_page
) != bh
) {
1136 tail
->b_next_free
= tmp
;
1140 /* Update the reuse list */
1141 tail
->b_next_free
= xchg(&reuse_list
, NULL
);
1144 /* Wake up any waiters ... */
1145 wake_up(&buffer_wait
);
1148 static void end_buffer_io_async(struct buffer_head
* bh
, int uptodate
)
1150 unsigned long flags
;
1151 struct buffer_head
*tmp
;
1154 mark_buffer_uptodate(bh
, uptodate
);
1157 /* This is a temporary buffer used for page I/O. */
1158 page
= mem_map
+ MAP_NR(bh
->b_data
);
1159 if (!PageLocked(page
))
1161 if (bh
->b_count
!= 1)
1164 if (!test_bit(BH_Uptodate
, &bh
->b_state
))
1165 set_bit(PG_error
, &page
->flags
);
1168 * Be _very_ careful from here on. Bad things can happen if
1169 * two buffer heads end IO at almost the same time and both
1170 * decide that the page is now completely done.
1172 * Async buffer_heads are here only as labels for IO, and get
1173 * thrown away once the IO for this page is complete. IO is
1174 * deemed complete once all buffers have been visited
1175 * (b_count==0) and are now unlocked. We must make sure that
1176 * only the _last_ buffer that decrements its count is the one
1177 * that free's the page..
1186 tmp
= tmp
->b_this_page
;
1187 } while (tmp
!= bh
);
1189 /* OK, the async IO on this page is complete. */
1190 free_async_buffers(bh
);
1191 restore_flags(flags
);
1192 clear_bit(PG_locked
, &page
->flags
);
1193 wake_up(&page
->wait
);
1194 after_unlock_page(page
);
1198 restore_flags(flags
);
1202 printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1206 printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1211 * Start I/O on a page.
1212 * This function expects the page to be locked and may return before I/O is complete.
1213 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1215 int brw_page(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
, int bmap
)
1217 struct buffer_head
*bh
, *prev
, *next
, *arr
[MAX_BUF_PER_PAGE
];
1220 if (!PageLocked(page
))
1221 panic("brw_page: page not locked for I/O");
1222 clear_bit(PG_uptodate
, &page
->flags
);
1223 clear_bit(PG_error
, &page
->flags
);
1225 * Allocate async buffer heads pointing to this page, just for I/O.
1226 * They do _not_ show up in the buffer hash table!
1227 * They are _not_ registered in page->buffers either!
1229 bh
= create_buffers(page_address(page
), size
, 1);
1231 /* WSH: exit here leaves page->count incremented */
1232 clear_bit(PG_locked
, &page
->flags
);
1233 wake_up(&page
->wait
);
1239 struct buffer_head
* tmp
;
1242 init_buffer(next
, dev
, block
, end_buffer_io_async
, NULL
);
1243 set_bit(BH_Uptodate
, &next
->b_state
);
1246 * When we use bmap, we define block zero to represent
1247 * a hole. ll_rw_page, however, may legitimately
1248 * access block zero, and we need to distinguish the
1251 if (bmap
&& !block
) {
1252 memset(next
->b_data
, 0, size
);
1256 tmp
= get_hash_table(dev
, block
, size
);
1258 if (!buffer_uptodate(tmp
)) {
1260 ll_rw_block(READ
, 1, &tmp
);
1261 wait_on_buffer(tmp
);
1264 memcpy(next
->b_data
, tmp
->b_data
, size
);
1266 memcpy(tmp
->b_data
, next
->b_data
, size
);
1267 mark_buffer_dirty(tmp
, 0);
1274 clear_bit(BH_Uptodate
, &next
->b_state
);
1276 set_bit(BH_Dirty
, &next
->b_state
);
1278 } while (prev
= next
, (next
= next
->b_this_page
) != NULL
);
1279 prev
->b_this_page
= bh
;
1282 ll_rw_block(rw
, nr
, arr
);
1283 /* The rest of the work is done in mark_buffer_uptodate()
1284 * and unlock_buffer(). */
1286 unsigned long flags
;
1287 clear_bit(PG_locked
, &page
->flags
);
1288 set_bit(PG_uptodate
, &page
->flags
);
1289 wake_up(&page
->wait
);
1292 free_async_buffers(bh
);
1293 restore_flags(flags
);
1294 after_unlock_page(page
);
1301 * This is called by end_request() when I/O has completed.
1303 void mark_buffer_uptodate(struct buffer_head
* bh
, int on
)
1306 struct buffer_head
*tmp
= bh
;
1307 set_bit(BH_Uptodate
, &bh
->b_state
);
1308 /* If a page has buffers and all these buffers are uptodate,
1309 * then the page is uptodate. */
1311 if (!test_bit(BH_Uptodate
, &tmp
->b_state
))
1313 tmp
=tmp
->b_this_page
;
1314 } while (tmp
&& tmp
!= bh
);
1315 set_bit(PG_uptodate
, &mem_map
[MAP_NR(bh
->b_data
)].flags
);
1318 clear_bit(BH_Uptodate
, &bh
->b_state
);
1322 * Generic "readpage" function for block devices that have the normal
1323 * bmap functionality. This is most of the block device filesystems.
1324 * Reads the page asynchronously --- the unlock_buffer() and
1325 * mark_buffer_uptodate() functions propagate buffer state into the
1326 * page struct once IO has completed.
1328 int generic_readpage(struct file
* file
, struct page
* page
)
1330 struct dentry
*dentry
= file
->f_dentry
;
1331 struct inode
*inode
= dentry
->d_inode
;
1332 unsigned long block
;
1333 int *p
, nr
[PAGE_SIZE
/512];
1336 atomic_inc(&page
->count
);
1337 set_bit(PG_locked
, &page
->flags
);
1338 set_bit(PG_free_after
, &page
->flags
);
1340 i
= PAGE_SIZE
>> inode
->i_sb
->s_blocksize_bits
;
1341 block
= page
->offset
>> inode
->i_sb
->s_blocksize_bits
;
1344 *p
= inode
->i_op
->bmap(inode
, block
);
1351 brw_page(READ
, page
, inode
->i_dev
, nr
, inode
->i_sb
->s_blocksize
, 1);
1356 * Try to increase the number of buffers available: the size argument
1357 * is used to determine what kind of buffers we want.
1359 static int grow_buffers(int size
)
1362 struct buffer_head
*bh
, *tmp
;
1363 struct buffer_head
* insert_point
;
1366 if ((size
& 511) || (size
> PAGE_SIZE
)) {
1367 printk("VFS: grow_buffers: size = %d\n",size
);
1371 if (!(page
= __get_free_page(GFP_BUFFER
)))
1373 bh
= create_buffers(page
, size
, 0);
1379 isize
= BUFSIZE_INDEX(size
);
1380 insert_point
= free_list
[isize
];
1385 tmp
->b_next_free
= insert_point
->b_next_free
;
1386 tmp
->b_prev_free
= insert_point
;
1387 insert_point
->b_next_free
->b_prev_free
= tmp
;
1388 insert_point
->b_next_free
= tmp
;
1390 tmp
->b_prev_free
= tmp
;
1391 tmp
->b_next_free
= tmp
;
1395 if (tmp
->b_this_page
)
1396 tmp
= tmp
->b_this_page
;
1400 tmp
->b_this_page
= bh
;
1401 free_list
[isize
] = bh
;
1402 mem_map
[MAP_NR(page
)].buffers
= bh
;
1403 buffermem
+= PAGE_SIZE
;
1408 * Can the buffer be thrown out?
1410 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1411 #define buffer_busy(bh) ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
1414 * try_to_free_buffers() checks if all the buffers on this particular page
1415 * are unused, and free's the page if so.
1417 * Wake up bdflush() if this fails - if we're running low on memory due
1418 * to dirty buffers, we need to flush them out as quickly as possible.
1420 int try_to_free_buffers(struct page
* page_map
)
1422 struct buffer_head
* tmp
, * bh
= page_map
->buffers
;
1426 struct buffer_head
* p
= tmp
;
1428 tmp
= tmp
->b_this_page
;
1429 if (!buffer_busy(p
))
1434 } while (tmp
!= bh
);
1438 struct buffer_head
* p
= tmp
;
1439 tmp
= tmp
->b_this_page
;
1441 remove_from_queues(p
);
1442 put_unused_buffer_head(p
);
1443 } while (tmp
!= bh
);
1445 /* Wake up anyone waiting for buffer heads */
1446 wake_up(&buffer_wait
);
1448 /* And free the page */
1449 buffermem
-= PAGE_SIZE
;
1450 page_map
->buffers
= NULL
;
1451 __free_page(page_map
);
1455 /* ================== Debugging =================== */
/* Dump buffer-cache statistics to the console: total buffer memory, buffer
 * head / block / hashed counts, then one summary line per LRU list
 * (CLEAN/LOCKED/DIRTY) tallying used, locked, protected and dirty buffers.
 * NOTE(review): the inner do-loop opener, the per-counter increments and
 * some declarations (e.g. nlist, protected) are missing from this paste. */
1457 void show_buffers(void)
1459 struct buffer_head
* bh
;
1460 int found
= 0, locked
= 0, dirty
= 0, used
= 0, lastused
= 0;
1463 static char *buf_types
[NR_LIST
] = {"CLEAN","LOCKED","DIRTY"};
1465 printk("Buffer memory: %6dkB\n",buffermem
>>10);
1466 printk("Buffer heads: %6d\n",nr_buffer_heads
);
1467 printk("Buffer blocks: %6d\n",nr_buffers
);
1468 printk("Buffer hashed: %6d\n",nr_hashed_buffers
);
/* Walk each LRU list and tally the state of every buffer on it. */
1470 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
/* Reset the per-list counters before scanning this list. */
1471 found
= locked
= dirty
= used
= lastused
= protected = 0;
1472 bh
= lru_list
[nlist
];
1477 if (buffer_locked(bh
))
1479 if (buffer_protected(bh
))
1481 if (buffer_dirty(bh
))
/* lastused remembers the scan position of the last in-use buffer. */
1484 used
++, lastused
= found
;
/* Advance along the circular free/LRU chain until we wrap to the head. */
1485 bh
= bh
->b_next_free
;
1486 } while (bh
!= lru_list
[nlist
]);
1487 printk("%8s: %d buffers, %d used (last=%d), "
1488 "%d locked, %d protected, %d dirty\n",
1489 buf_types
[nlist
], found
, used
, lastused
,
1490 locked
, protected, dirty
);
1495 /* ===================== Init ======================= */
1498 * allocate the hash table and init the free list
1499 * Use gfp() for the hash table to decrease TLB misses, use
1500 * SLAB cache for buffer heads.
/* NOTE(review): the enclosing comment delimiters and several body lines
 * (the "order" declaration, the do-loop opener, the NULL check before the
 * first panic, and the kmem_cache_create failure check) are missing from
 * this paste; comments below annotate only what is visible. */
1502 void __init
buffer_init(unsigned long memory_size
)
1505 unsigned int nr_hash
;
1507 /* we need to guess at the right sort of size for a buffer cache.
1508 the heuristic from working with large databases and getting
1509 fsync times (ext2) manageable, is the following */
/* Find the smallest power-of-two page order covering memory_size,
 * starting no lower than order 5. */
1512 for (order
= 5; (1UL << order
) < memory_size
; order
++);
1514 /* try to allocate something until we get it or we're asking
1515 for something that is really too small */
/* nr_hash = number of bucket pointers that fit in 2^order pages. */
1518 nr_hash
= (1UL << order
) * PAGE_SIZE
/
1519 sizeof(struct buffer_head
*);
1520 hash_table
= (struct buffer_head
**)
1521 __get_free_pages(GFP_ATOMIC
, order
);
/* Retry with progressively smaller orders, but never below order 5. */
1522 } while (hash_table
== NULL
&& --order
> 4);
1525 panic("Failed to allocate buffer hash table\n");
1526 memset(hash_table
, 0, nr_hash
* sizeof(struct buffer_head
*));
/* nr_hash is a power of two, so nr_hash-1 doubles as the hash bit mask. */
1527 bh_hash_mask
= nr_hash
-1;
/* SLAB cache for buffer heads; hardware-cacheline aligned objects. */
1529 bh_cachep
= kmem_cache_create("buffer_head",
1530 sizeof(struct buffer_head
),
1532 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
1534 panic("Cannot create buffer head SLAB cache\n");
1536 * Allocate the reserved buffer heads.
/* Pre-allocate NR_RESERVED buffer heads up front so I/O can still make
 * progress when the allocator is under memory pressure. */
1538 while (nr_buffer_heads
< NR_RESERVED
) {
1539 struct buffer_head
* bh
;
1541 bh
= kmem_cache_alloc(bh_cachep
, SLAB_ATOMIC
);
1544 put_unused_buffer_head(bh
);
1548 lru_list
[BUF_CLEAN
] = 0;
/* Seed the free list with one page's worth of BLOCK_SIZE buffers. */
1549 grow_buffers(BLOCK_SIZE
);
1553 /* ====================== bdflush support =================== */
1555 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1556 * response to dirty buffers. Once this process is activated, we write back
1557 * a limited number of buffers to the disks and then go back to sleep again.
/* Callers of wakeup_bdflush(1) sleep on this queue until bdflush signals
 * that a flush pass has completed (see wake_up(&bdflush_done) below). */
1559 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done
);
/* Task struct of the bdflush kernel thread, set once the daemon starts.
 * NOTE(review): initialised with 0 rather than NULL -- equivalent for a
 * pointer, but NULL would be the conventional null-pointer constant. */
1560 struct task_struct
*bdflush_tsk
= 0;
/* Kick the bdflush daemon awake.  When the (visible) wait path is taken,
 * also start the disk task queue and sleep until bdflush reports a
 * completed flush pass.  A no-op when called from bdflush itself, which
 * avoids self-deadlock.
 * NOTE(review): braces, the early return and the "if (wait)" guard are
 * missing from this paste, so the exact control flow is not visible here. */
1562 void wakeup_bdflush(int wait
)
1564 if (current
== bdflush_tsk
)
1566 wake_up_process(bdflush_tsk
);
/* Push any queued block I/O to the drivers before sleeping. */
1568 run_task_queue(&tq_disk
);
1569 sleep_on(&bdflush_done
);
1575 * Here we attempt to write back old buffers.
1576 * To prevent deadlocks for a loop device:
1577 * 1) Do non-blocking writes to loop (avoids deadlock with running
1578 * out of request blocks).
1579 * 2) But do a blocking write if the only dirty buffers are loop buffers
1580 * (otherwise we go into an infinite busy-loop).
1581 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
1582 * with running out of free buffers for loop's "real" device).
/* NOTE(review): many original lines (braces, the refile calls, the ndirty
 * declaration, loop bodies and continues) are missing from this paste;
 * the comments below annotate only what is visible. */
1585 static inline void sync_old_buffers(void)
/* Loop-device writes start out as non-blocking write-ahead (WRITEA) and
 * may later be escalated to blocking WRITE -- see the note near 1661. */
1589 int wrta_cmd
= WRITEA
;
1591 int ncount
= 0, nwritten
= 0;
1593 struct buffer_head
* bh
, *next
;
/* Pass 1: sweep the CLEAN list and refile any buffer that became dirty
 * or locked while sitting there. */
1596 bh
= lru_list
[BUF_CLEAN
];
1598 for(i
= nr_buffers_type
[BUF_CLEAN
]; --i
> 0; bh
= next
) {
/* Grab the successor first: refiling unlinks bh from this list. */
1599 next
= bh
->b_next_free
;
1601 /* Dirty/locked buffer on clean list? Refile it */
1602 if (buffer_locked(bh
) || buffer_dirty(bh
)) {
/* Pass 2: sweep the LOCKED list and refile buffers whose I/O completed. */
1609 bh
= lru_list
[BUF_LOCKED
];
1611 for(i
= nr_buffers_type
[BUF_LOCKED
]; --i
> 0; bh
= next
) {
1612 next
= bh
->b_next_free
;
1614 /* Unlocked buffer on locked list? Refile it */
1615 if (!buffer_locked(bh
))
/* Pass 3: write back at most bdf_prm.b_un.ndirty buffers from the DIRTY
 * list, honouring the loop-device deadlock rules documented above. */
1620 bh
= lru_list
[BUF_DIRTY
];
1622 for (i
= nr_buffers_type
[BUF_DIRTY
];
1623 i
-- > 0 && ndirty
< bdf_prm
.b_un
.ndirty
;
1625 /* We may have stalled while waiting for
/* The list may have been refiled under us; restart if bh moved. */
1627 if(bh
->b_list
!= BUF_DIRTY
)
1629 next
= bh
->b_next_free
;
1630 if(!lru_list
[BUF_DIRTY
]) {
1631 printk("Dirty list empty %d\n", i
);
1635 /* Clean buffer on dirty list? Refile it */
1636 if (!buffer_dirty(bh
)) {
1641 if (buffer_locked(bh
))
1643 /* Should we write back buffers that are
1644 shared or not?? Currently dirty buffers
1645 are not shared, so it does not matter */
/* Clear the age stamp before queuing the write. */
1649 bh
->b_flushtime
= 0;
1650 if (MAJOR(bh
->b_dev
) == LOOP_MAJOR
) {
/* Non-blocking write-ahead for loop devices (rule 1 above). */
1651 ll_rw_block(wrta_cmd
,1, &bh
);
/* WRITEA may decline without blocking; the buffer then stays dirty. */
1653 if (buffer_dirty(bh
))
/* Ordinary devices get a plain (blocking) write. */
1657 ll_rw_block(WRITE
, 1, &bh
);
1661 /* If we didn't write anything, but there are still
1662 * dirty buffers, then make the next write to a
1663 * loop device to be a blocking write.
1664 * This lets us block--which we _must_ do! */
1666 && nr_buffers_type
[BUF_DIRTY
] > 0 && wrta_cmd
!= WRITE
) {
/* Diagnostics: report misfiled buffers and how much was written. */
1672 if (ncount
) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount
);
1673 printk("wrote %d/%d buffers...", nwritten
, ndirty
);
/* Start the disk task queue so the queued writes actually go out. */
1675 run_task_queue(&tq_disk
);
1679 /* This is the interface to bdflush. As we get more sophisticated, we can
1680 * pass tuning parameters to this "process", to adjust how it behaves.
1681 * We would want to verify each parameter, however, to make sure that it
/* NOTE(review): the kernel-lock bracketing, the decoding of func into the
 * parameter index i, and several return statements are missing from this
 * paste; comments below annotate only what is visible. */
1684 asmlinkage
int sys_bdflush(int func
, long data
)
1686 int i
, error
= -EPERM
;
/* Tuning the buffer cache is a privileged operation. */
1689 if (!capable(CAP_SYS_ADMIN
))
1693 /* Func 1 used to call sync_old_buffers; a user space
1694 daemon would call it periodically. This is no
1695 longer necessary. Returning -EPERM here makes the
1696 daemon silently exit. */
1699 /* Basically func 1 means read param 1, 2 means write param 1, etc */
/* Reject parameter indices outside the bdf_prm table. */
1703 if (i
< 0 || i
>= N_PARAM
)
/* Even func values read parameter i back to user space... */
1705 if((func
& 1) == 0) {
1706 error
= put_user(bdf_prm
.data
[i
], (int*)data
);
/* ...odd func values store it, after range-checking against the
 * per-parameter bdflush_min/bdflush_max tables. */
1709 if (data
< bdflush_min
[i
] || data
> bdflush_max
[i
])
1711 bdf_prm
.data
[i
] = data
;
1716 /* Having func 0 used to launch the actual bdflush and then never
1717 * return (unless explicitly killed). We return zero here to
1718 * remain semi-compatible with present update(8) programs.
1726 /* This is the actual bdflush daemon itself. It used to be started
1727 * from the syscall above, but now we launch it ourselves internally
1728 * with kernel_thread(...) directly after the first thread in
1729 * init/main.c. Every so often, or when woken up by another task that
1730 * needs memory, we call sync_old_buffers to partially clear the dirty list.
1733 int bdflush(void * unused
)
1735 long remaining
= HZ
* bdf_prm
.b_un
.interval
;
1736 struct task_struct
*tsk
= current
;
1739 * We have a bare-bones task_struct, and really should fill
1740 * in a few more things so "top" and /proc/2/{exe,root,cwd}
1741 * display semi-sane things. Not real crucial though...
1746 tsk
->dumpable
= 0; /* inhibit ptrace() */
1747 strcpy(tsk
->comm
, "kflushd");
1748 sigfillset(&tsk
->blocked
);
1752 * As a kernel thread we want to tamper with system buffers
1753 * and other internals and thus be subject to the SMP locking
1754 * rules. (On a uniprocessor box this does nothing).
1759 tsk
->state
= TASK_INTERRUPTIBLE
;
1760 remaining
= schedule_timeout(remaining
);
1763 printk("bdflush() activated...");
1765 CHECK_EMERGENCY_SYNC
1767 if (remaining
== 0) {
1769 * Also try to flush inodes and supers, since
1770 * otherwise there would be no way of ensuring
1771 * that these quantities ever get written
1772 * back. Ideally, we would have a timestamp
1773 * on the inodes and superblocks so that we
1774 * could write back only the old ones.
1778 remaining
= HZ
* bdf_prm
.b_un
.interval
;
1781 /* Keep flushing till there aren't very many dirty buffers */
1784 } while(nr_buffers_type
[BUF_DIRTY
] > nr_buffers
* bdf_prm
.b_un
.nfract
/100);
1786 wake_up(&bdflush_done
);
1788 printk("sleeping again.\n");