/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */

/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */

/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */

/* Added 32k buffer block sizes - these are required for older ARM systems. */
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/sysrq.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/quotaops.h>

#include <asm/uaccess.h>
#include <asm/bitops.h>
static char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};
#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
					     number of unused buffer heads */
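
/*
 * Illustrative note (not part of the original source): with a 4 kB
 * PAGE_SIZE, BUFSIZE_INDEX(512) == 0, BUFSIZE_INDEX(1024) == 1 and
 * BUFSIZE_INDEX(4096) == buffersize_index[4096 >> 9] ==
 * buffersize_index[8] == 3.  MAX_BUF_PER_PAGE is then 4096/512 == 8,
 * so NR_RESERVED == 16 and MAX_UNUSED_BUFFERS == 36.
 */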
static unsigned long bh_hash_mask = 0;

static int grow_buffers(int size);

static struct buffer_head ** hash_table;
static struct buffer_head * lru_list[NR_LIST] = {NULL, };
static struct buffer_head * free_list[NR_SIZES] = {NULL, };

static kmem_cache_t *bh_cachep;

static struct buffer_head * unused_list = NULL;
static struct buffer_head * reuse_list = NULL;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);

static int nr_buffers = 0;
static int nr_buffers_type[NR_LIST] = {0,};
static int nr_buffer_heads = 0;
static int nr_unused_buffer_heads = 0;
static int nr_hashed_buffers = 0;
/* This is used by some architectures to estimate available memory. */

/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
union bdflush_param {
	struct {
		int nfract;	/* Percentage of buffer cache dirty to
				   activate bdflush */
		int ndirty;	/* Maximum number of dirty blocks to write out per
				   wake-cycle */
		int nrefill;	/* Number of clean buffers to try to obtain
				   each time we call refill */
		int nref_dirt;	/* Dirty buffer threshold for activating bdflush
				   when trying to refill buffers. */
		int dummy1;	/* unused */
		int age_buffer;	/* Time for normal buffer to age before
				   we flush it */
		int age_super;	/* Time for superblock to age before we
				   flush it */
		int dummy2;	/* unused */
		int dummy3;	/* unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
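
/*
 * Illustrative note (not part of the original source): with the defaults
 * above, bdflush kicks in once more than nfract == 40% of the buffer
 * cache is dirty, writes at most ndirty == 500 blocks per wake-cycle,
 * and lets a normal buffer age for age_buffer == 30*HZ jiffies (30s,
 * 5s for superblocks) before it is flushed.  sys_bdflush() below reads
 * and writes these slots through bdf_prm.data[], clamped to
 * bdflush_min[]/bdflush_max[].
 */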
void wakeup_bdflush(int);

/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and getting rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * if 'b_wait' is set before calling this, so that the queues aren't set
 * up unnecessarily.
 */
void __wait_on_buffer(struct buffer_head * bh)
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(&bh->b_wait, &wait);
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (buffer_locked(bh)) {
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed.  Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
static int sync_buffers(kdev_t dev, int wait)
	int i, retry, pass = 0, err = 0;
	struct buffer_head * bh, *next;

	/* One pass for no-wait, three for wait:
	 * 0) write out all dirty, unlocked buffers;
	 * 1) write out all dirty buffers, waiting if locked;
	 * 2) wait for completion by waiting for all buffers to unlock.
	 */

	/* We search all lists as a failsafe mechanism, not because we expect
	 * there to be dirty buffers on any of the other lists.
	 */
	bh = lru_list[BUF_DIRTY];
	for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
		if (bh->b_list != BUF_DIRTY)
		next = bh->b_next_free;
		if (!lru_list[BUF_DIRTY])
		if (dev && bh->b_dev != dev)
		if (buffer_locked(bh)) {
			/* Buffer is locked; skip it unless wait is
			 * requested AND pass > 0.
			 */
			if (!wait || !pass) {

		/* If an unlocked buffer is not uptodate, there has
		 * been an IO error.  Skip it.
		 */
		if (wait && buffer_req(bh) && !buffer_locked(bh) &&
		    !buffer_dirty(bh) && !buffer_uptodate(bh)) {

		/* Don't write clean buffers.  Don't write ANY buffers
		 * on the third pass.
		 */
		if (!buffer_dirty(bh) || pass >= 2)

		/* Don't bother about locked buffers.
		 *
		 * XXX We checked if it was locked above and there is no
		 * XXX way we could have slept in between. -DaveM
		 */
		if (buffer_locked(bh))
		ll_rw_block(WRITE, 1, &bh);

	bh = lru_list[BUF_LOCKED];
	for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
		if (bh->b_list != BUF_LOCKED)
		next = bh->b_next_free;
		if (!lru_list[BUF_LOCKED])
		if (dev && bh->b_dev != dev)
		if (buffer_locked(bh)) {
			/* Buffer is locked; skip it unless wait is
			 * requested AND pass > 0.
			 */
			if (!wait || !pass) {

	/* If we are waiting for the sync to succeed, and if any dirty
	 * blocks were written, then repeat; on the second pass, only
	 * wait for buffers being written (do not pass to write any
	 * more buffers on the second pass).
	 */
	} while (wait && retry && ++pass <= 2);
void sync_dev(kdev_t dev)
	sync_buffers(dev, 0);
	sync_buffers(dev, 0);

	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */

int fsync_dev(kdev_t dev)
	sync_buffers(dev, 0);
	return sync_buffers(dev, 1);
asmlinkage int sys_sync(void)

/*
 * filp may be NULL if called via the msync of a vma.
 */
int file_fsync(struct file *filp, struct dentry *dentry)
	struct inode * inode = dentry->d_inode;
	struct super_block * sb;

	/* sync the inode to buffers */
	write_inode_now(inode);

	/* sync the superblock to buffers */
	if (sb->s_op && sb->s_op->write_super)
		sb->s_op->write_super(sb);

	/* .. finally sync the buffers to disk */
	return sync_buffers(dev, 1);
asmlinkage int sys_fsync(unsigned int fd)
	struct dentry * dentry;
	struct inode * inode;

	dentry = file->f_dentry;
	inode = dentry->d_inode;

	if (!file->f_op || !file->f_op->fsync)

	/* We need to protect against concurrent writers.. */
	err = file->f_op->fsync(file, dentry);
asmlinkage int sys_fdatasync(unsigned int fd)
	struct dentry * dentry;
	struct inode * inode;

	dentry = file->f_dentry;
	inode = dentry->d_inode;

	if (!file->f_op || !file->f_op->fsync)

	/* this needs further work, at the moment it is identical to fsync() */
	err = file->f_op->fsync(file, dentry);
void invalidate_buffers(kdev_t dev)
	struct buffer_head * bh;

	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
			if (bh->b_dev != dev)
			if (bh->b_dev != dev)
			clear_bit(BH_Protected, &bh->b_state);
			clear_bit(BH_Uptodate, &bh->b_state);
			clear_bit(BH_Dirty, &bh->b_state);
			clear_bit(BH_Req, &bh->b_state);
#define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
#define hash(dev,block) hash_table[_hashfn(dev,block)]
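
/*
 * Worked example (not from the original source): bh_hash_mask is set to
 * nr_hash - 1 in buffer_init(), so with, say, 32768 hash table entries
 * the mask is 0x7fff and hash(dev, 1234) indexes
 * hash_table[(HASHDEV(dev) ^ 1234) & 0x7fff].
 */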
static void insert_into_hash_list(struct buffer_head * bh)
	struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
	struct buffer_head *next = *bhp;

	next->b_pprev = &bh->b_next;

static void remove_from_hash_queue(struct buffer_head * bh)
	struct buffer_head **pprev = bh->b_pprev;
	struct buffer_head * next = bh->b_next;

	next->b_pprev = pprev;
static void insert_into_lru_list(struct buffer_head * bh)
	struct buffer_head **bhp = &lru_list[bh->b_list];

	if (bh->b_dev == B_FREE)
		bh->b_prev_free = bh;
		panic("VFS: buffer LRU pointers corrupted");
	bh->b_next_free = *bhp;
	bh->b_prev_free = (*bhp)->b_prev_free;
	(*bhp)->b_prev_free->b_next_free = bh;
	(*bhp)->b_prev_free = bh;
	nr_buffers_type[bh->b_list]++;
static void remove_from_lru_list(struct buffer_head * bh)
	if (!(bh->b_prev_free) || !(bh->b_next_free))
	if (bh->b_dev == B_FREE) {
		printk("LRU list corrupted");
	bh->b_prev_free->b_next_free = bh->b_next_free;
	bh->b_next_free->b_prev_free = bh->b_prev_free;
	if (lru_list[bh->b_list] == bh)
		lru_list[bh->b_list] = bh->b_next_free;
	if (lru_list[bh->b_list] == bh)
		lru_list[bh->b_list] = NULL;
	bh->b_next_free = bh->b_prev_free = NULL;
	nr_buffers_type[bh->b_list]--;
static void remove_from_free_list(struct buffer_head * bh)
	int isize = BUFSIZE_INDEX(bh->b_size);
	if (!(bh->b_prev_free) || !(bh->b_next_free))
		panic("VFS: Free block list corrupted");
	if(bh->b_dev != B_FREE)
		panic("Free list corrupted");
	if(!free_list[isize])
		panic("Free list empty");
	if(bh->b_next_free == bh)
		free_list[isize] = NULL;
	bh->b_prev_free->b_next_free = bh->b_next_free;
	bh->b_next_free->b_prev_free = bh->b_prev_free;
	if (free_list[isize] == bh)
		free_list[isize] = bh->b_next_free;
	bh->b_next_free = bh->b_prev_free = NULL;
static void remove_from_queues(struct buffer_head * bh)
	if (bh->b_dev == B_FREE)
	remove_from_hash_queue(bh);
	remove_from_lru_list(bh);

static void put_last_free(struct buffer_head * bh)
	struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];

	bh->b_dev = B_FREE;	/* So it is obvious we are on the free list. */

	/* Add to back of free list. */
		bh->b_prev_free = bh;

	bh->b_next_free = *bhp;
	bh->b_prev_free = (*bhp)->b_prev_free;
	(*bhp)->b_prev_free->b_next_free = bh;
	(*bhp)->b_prev_free = bh;
struct buffer_head * find_buffer(kdev_t dev, int block, int size)
	struct buffer_head * next;

	next = hash(dev,block);
		struct buffer_head *tmp = next;
		if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)

/*
 * Why like this, I hear you say... The reason is race-conditions.
 * As we don't lock buffers (unless we are reading them, that is),
 * something might happen to it while we sleep (ie a read-error
 * will force it bad). This shouldn't really happen currently, but
 */
struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
	struct buffer_head * bh;

	bh = find_buffer(dev,block,size);

unsigned int get_hardblocksize(kdev_t dev)
	/*
	 * Get the hard sector size for the given device.  If we don't know
	 * what it is, return 0.
	 */
	if (hardsect_size[MAJOR(dev)] != NULL) {
		int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];

	/*
	 * We don't know what the hardware sector size for this device is.
	 * Return 0 indicating that we don't know.
	 */
void set_blocksize(kdev_t dev, int size)
	extern int *blksize_size[];
	struct buffer_head * bh, *bhnext;

	if (!blksize_size[MAJOR(dev)])

	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		panic("Invalid blocksize passed to set_blocksize");

	if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
		blksize_size[MAJOR(dev)][MINOR(dev)] = size;
	if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)

	sync_buffers(dev, 2);
	blksize_size[MAJOR(dev)][MINOR(dev)] = size;

	/* We need to be quite careful how we do this - we are moving entries
	 * around on the free list, and we can get in a loop if we are not careful.
	 */
	for(nlist = 0; nlist < NR_LIST; nlist++) {
		bh = lru_list[nlist];
		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
			bhnext = bh->b_next_free;
			if (bh->b_dev != dev)
			if (bh->b_size == size)
			if (bh->b_dev == dev && bh->b_size != size) {
				clear_bit(BH_Dirty, &bh->b_state);
				clear_bit(BH_Uptodate, &bh->b_state);
				clear_bit(BH_Req, &bh->b_state);
			remove_from_queues(bh);
/*
 * We used to try various strange things. Let's not.
 */
static void refill_freelist(int size)
	if (!grow_buffers(size)) {
		current->policy |= SCHED_YIELD;
void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
		 bh_end_io_t *handler, void *dev_id)
	bh->b_list = BUF_CLEAN;
	bh->b_blocknr = block;
	bh->b_end_io = handler;
	bh->b_dev_id = dev_id;

static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
	mark_buffer_uptodate(bh, uptodate);
/*
 * Ok, this is getblk, and it isn't very clear, again to hinder
 * race-conditions. Most of the code is seldom used, (ie repeating),
 * so it should be much more efficient than it looks.
 *
 * The algorithm is changed: hopefully better, and an elusive bug removed.
 *
 * 14.02.92: changed it to sync dirty buffers a bit: better performance
 * when the filesystem starts to get full of dirty blocks (I hope).
 */
struct buffer_head * getblk(kdev_t dev, int block, int size)
	struct buffer_head * bh;

	bh = get_hash_table(dev, block, size);
	if (!buffer_dirty(bh)) {

	isize = BUFSIZE_INDEX(size);
	bh = free_list[isize];
	remove_from_free_list(bh);

	/* OK, FINALLY we know that this buffer is the only one of its kind,
	 * and that it's unused (b_count=0), unlocked, and clean.
	 */
	init_buffer(bh, dev, block, end_buffer_io_sync, NULL);

	/* Insert the buffer into the regular lists */
	insert_into_lru_list(bh);
	insert_into_hash_list(bh);

	/*
	 * If we block while refilling the free list, somebody may
	 * create the buffer first ... search the hashes again.
	 */
	refill_freelist(size);
	if (!find_buffer(dev,block,size))
void set_writetime(struct buffer_head * buf, int flag)
	if (buffer_dirty(buf)) {
		/* Move buffer to dirty list if jiffies is clear. */
		newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
				     bdf_prm.b_un.age_buffer);
		if(!buf->b_flushtime || buf->b_flushtime > newtime)
			buf->b_flushtime = newtime;
		buf->b_flushtime = 0;
/*
 * Put a buffer into the appropriate list, without side-effects.
 */
static void file_buffer(struct buffer_head *bh, int list)
	remove_from_lru_list(bh);
	insert_into_lru_list(bh);
/*
 * if a new dirty buffer is created we need to balance bdflush.
 */
static inline void balance_dirty (kdev_t dev)
	int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);

	/* This buffer is dirty, maybe we need to start flushing.
	 * If too high a percentage of the buffers are dirty...
	 */
	if (nr_buffers_type[BUF_DIRTY] > too_many) {

	/* If this is a loop device, and
	 * more than half of the buffers are dirty...
	 * (Prevents no-free-buffers deadlock with loop device.)
	 */
	if (MAJOR(dev) == LOOP_MAJOR &&
	    nr_buffers_type[BUF_DIRTY]*2 > nr_buffers)
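
	/*
	 * Illustrative arithmetic (not from the original source): with the
	 * default nfract of 40 and, say, 1000 buffers in the cache,
	 * too_many == 400, so bdflush is woken once more than 400 buffers
	 * sit on the BUF_DIRTY list; for a loop device the stricter
	 * "more than half dirty" test above applies as well.
	 */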
/*
 * A buffer may need to be moved from one buffer list to another
 * (e.g. in case it is not shared any more). Handle this.
 */
void refile_buffer(struct buffer_head * buf)
	if(buf->b_dev == B_FREE) {
		printk("Attempt to refile free buffer\n");
	if (buffer_dirty(buf))
	else if (buffer_locked(buf))
		dispose = BUF_LOCKED;
	if(dispose != buf->b_list) {
		file_buffer(buf, dispose);
		if (dispose == BUF_DIRTY)
			balance_dirty(buf->b_dev);
/*
 * Release a buffer head
 */
void __brelse(struct buffer_head * buf)
	/* If dirty, mark the time this buffer should be written back. */
	set_writetime(buf, 0);
	wake_up(&buffer_wait);
	printk("VFS: brelse: Trying to free free buffer\n");
/*
 * bforget() is like brelse(), except it puts the buffer on the
 * free list if it can.. We can NOT free the buffer if:
 *	- there are other users of it
 *	- it is locked and thus can have active IO
 */
void __bforget(struct buffer_head * buf)
	if (buf->b_count != 1 || buffer_locked(buf)) {
	remove_from_queues(buf);
/*
 * bread() reads a specified block and returns the buffer that contains
 * it. It returns NULL if the block was unreadable.
 */
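
/*
 * Typical caller pattern (illustrative, not from the original source):
 *
 *	struct buffer_head *bh = bread(dev, block, BLOCK_SIZE);
 *	if (!bh)
 *		return -EIO;	(hypothetical error handling)
 *	memcpy(data, bh->b_data, BLOCK_SIZE);
 *	brelse(bh);
 */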
struct buffer_head * bread(kdev_t dev, int block, int size)
	struct buffer_head * bh;

	bh = getblk(dev, block, size);
	if (buffer_uptodate(bh))
	ll_rw_block(READ, 1, &bh);
	if (buffer_uptodate(bh))
/*
 * Ok, breada can be used as bread, but additionally to mark other
 * blocks for reading as well.  End the argument list with a negative
 * number.
 */
struct buffer_head * breada(kdev_t dev, int block, int bufsize,
	unsigned int pos, unsigned int filesize)
	struct buffer_head * bhlist[NBUF];
	struct buffer_head * bh;

	bh = getblk(dev, block, bufsize);
	index = BUFSIZE_INDEX(bh->b_size);
	if (buffer_uptodate(bh))
	else ll_rw_block(READ, 1, &bh);

	blocks = (filesize - pos) >> (9+index);
	if (blocks < (read_ahead[MAJOR(dev)] >> index))
		blocks = read_ahead[MAJOR(dev)] >> index;
	/* if (blocks) printk("breada (new) %d blocks\n",blocks); */

	for(i=1; i<blocks; i++) {
		bh = getblk(dev,block+i,bufsize);
		if (buffer_uptodate(bh)) {
		else bhlist[j++] = bh;

	/* Request the read for these buffers, and then release them. */
	ll_rw_block(READA, (j-1), bhlist+1);

	/* Wait for this buffer, and then continue on. */
	if (buffer_uptodate(bh))
/*
 * Note: the caller should wake up the buffer_wait list if needed.
 */
static void put_unused_buffer_head(struct buffer_head * bh)
	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
		kmem_cache_free(bh_cachep, bh);

//	memset(bh, 0, sizeof(*bh));
	init_waitqueue_head(&bh->b_wait);
	nr_unused_buffer_heads++;
	bh->b_next_free = unused_list;
/*
 * We can't put completed temporary IO buffer_heads directly onto the
 * unused_list when they become unlocked, since the device driver
 * end_request routines still expect access to the buffer_head's
 * fields after the final unlock.  So, the device driver puts them on
 * the reuse_list instead once IO completes, and we recover these to
 * the unused_list here.
 *
 * Note that we don't do a wakeup here, but return a flag indicating
 * whether we got any buffer heads. A task ready to sleep can check
 * the returned value, and any tasks already sleeping will have been
 * awakened when the buffer heads were added to the reuse list.
 */
static inline int recover_reusable_buffer_heads(void)
	struct buffer_head *head = xchg(&reuse_list, NULL);

		struct buffer_head *bh = head;
		head = head->b_next_free;
		put_unused_buffer_head(bh);
/*
 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
 * no-buffer-head deadlock.  Return NULL on failure; waiting for
 * buffer heads is now handled in create_buffers().
 */
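
/*
 * Sizing note (not from the original source): NR_RESERVED is
 * 2*MAX_BUF_PER_PAGE, i.e. 16 heads with 4 kB pages and 512-byte
 * buffers - enough to complete async I/O on two whole pages even when
 * the SLAB allocator cannot hand out any more buffer heads.
 */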
static struct buffer_head * get_unused_buffer_head(int async)
	struct buffer_head * bh;

	recover_reusable_buffer_heads();
	if (nr_unused_buffer_heads > NR_RESERVED) {
		unused_list = bh->b_next_free;
		nr_unused_buffer_heads--;

	/* This is critical.  We can't swap out pages to get
	 * more buffer heads, because the swap-out may need
	 * more buffer-heads itself.  Thus SLAB_BUFFER.
	 */
	if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		init_waitqueue_head(&bh->b_wait);

	/*
	 * If we need an async buffer, use the reserved buffer heads.
	 */
	if (async && unused_list) {
		unused_list = bh->b_next_free;
		nr_unused_buffer_heads--;

	/*
	 * (Pending further analysis ...)
	 * Ordinary (non-async) requests can use a different memory priority
	 * to free up pages. Any swapping thus generated will use async
	 * buffer heads.
	 */
	    (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		init_waitqueue_head(&bh->b_wait);
/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The async flag is used to differentiate async IO (paging, swapping)
 * from ordinary buffer allocations, and only async requests are allowed
 * to sleep waiting for buffer heads.
 */
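
/*
 * Illustrative note (not from the original source): for a 4 kB page and
 * 1 kB buffers the loop below walks offset = 3072, 2048, 1024, 0 and
 * builds four buffer heads whose b_data point into the page, linked
 * through b_this_page.
 */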
static struct buffer_head * create_buffers(unsigned long page,
		unsigned long size, int async)
	DECLARE_WAITQUEUE(wait, current);
	struct buffer_head *bh, *head;

	while ((offset -= size) >= 0) {
		bh = get_unused_buffer_head(async);
		bh->b_dev = B_FREE;	/* Flag as unused */
		bh->b_this_page = head;
		bh->b_next_free = NULL;
		bh->b_data = (char *) (page+offset);

	/*
	 * In case anything failed, we just free everything we got.
	 */
		head = head->b_this_page;
		put_unused_buffer_head(bh);

	/* Wake up any waiters ... */
	wake_up(&buffer_wait);

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	run_task_queue(&tq_disk);

	/*
	 * Set our state for sleeping, then check again for buffer heads.
	 * This ensures we won't miss a wake_up from an interrupt.
	 */
	add_wait_queue(&buffer_wait, &wait);
	current->state = TASK_UNINTERRUPTIBLE;
	if (!recover_reusable_buffer_heads())
	remove_wait_queue(&buffer_wait, &wait);
	current->state = TASK_RUNNING;
/* Run the hooks that have to be done when a page I/O has completed. */
static inline void after_unlock_page (struct page * page)
	if (test_and_clear_bit(PG_decr_after, &page->flags)) {
		atomic_dec(&nr_async_pages);
		printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
			(char *) page_address(page),
			atomic_read(&nr_async_pages));
	if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
		swap_after_unlock_page(page->offset);
	if (test_and_clear_bit(PG_free_after, &page->flags))
/*
 * Free all temporary buffers belonging to a page.
 * This needs to be called with interrupts disabled.
 */
static inline void free_async_buffers (struct buffer_head * bh)
	struct buffer_head *tmp, *tail;

	/*
	 * Link all the buffers into the b_next_free list,
	 * so we only have to do one xchg() operation ...
	 */
	while ((tmp = tail->b_this_page) != bh) {
		tail->b_next_free = tmp;

	/* Update the reuse list */
	tail->b_next_free = xchg(&reuse_list, NULL);

	/* Wake up any waiters ... */
	wake_up(&buffer_wait);
static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
	unsigned long flags;
	struct buffer_head *tmp;

	mark_buffer_uptodate(bh, uptodate);

	/* This is a temporary buffer used for page I/O. */
	page = mem_map + MAP_NR(bh->b_data);

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 *
	 * Async buffer_heads are here only as labels for IO, and get
	 * thrown away once the IO for this page is complete.  IO is
	 * deemed complete once all buffers have been visited
	 * (b_count==0) and are now unlocked. We must make sure that
	 * only the _last_ buffer that decrements its count is the one
	 * that free's the page..
	 */
	tmp = bh->b_this_page;
	if (buffer_locked(tmp))
		tmp = tmp->b_this_page;

	/* OK, the async IO on this page is complete. */
	restore_flags(flags);
	after_unlock_page(page);
	/*
	 * if none of the buffers had errors then we can set the
	 * page uptodate:
	 */
	if (!PageError(page))
		SetPageUptodate(page);
	if (page->owner != -1)
		page->owner = (int)current;
	restore_flags(flags);
static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
	struct buffer_head *head, *bh, *tail;

	if (!PageLocked(page))
	if (page->owner != (int)current)

	/*
	 * Allocate async buffer heads pointing to this page, just for I/O.
	 * They show up in the buffer hash table and are registered in
	 */
	head = create_buffers(page_address(page), size, 1);
	for (bh = head; bh; bh = bh->b_this_page) {
		init_buffer(bh, dev, block, end_buffer_io_async, NULL);
		/*
		 * When we use bmap, we define block zero to represent
		 * a hole.  ll_rw_page, however, may legitimately
		 * access block zero, and we need to distinguish the
		 * two cases.
		 */
		if (bmap && !block) {
			set_bit(BH_Uptodate, &bh->b_state);
			memset(bh->b_data, 0, size);
	tail->b_this_page = head;
	page->buffers = head;
/*
 * We don't have to release all buffers here, but
 * we have to be sure that no dirty buffer is left
 * and no IO is going on (no buffer is locked), because
 * we have truncated the file and are going to free the
 */
int generic_block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
	struct buffer_head *head, *bh, *next;
	unsigned int curr_off = 0;

	if (!PageLocked(page))
	head = page->buffers;
		unsigned int next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		/*
		 * is this block fully flushed?
		 */
		if (offset <= curr_off) {
			if (bh->b_blocknr) {
			if (bh->b_dev == B_FREE)
			mark_buffer_clean(bh);
		curr_off = next_off;
	} while (bh != head);

	/*
	 * subtle. We release buffer-heads only if this is
	 * the 'final' flushpage. We invalidate the bmap
	 * cached value in all cases.
	 */
		try_to_free_buffers(page);
static inline void create_empty_buffers (struct page *page,
			struct inode *inode, unsigned long blocksize)
	struct buffer_head *bh, *head, *tail;

	head = create_buffers(page_address(page), blocksize, 1);
		bh->b_dev = inode->i_dev;
		bh = bh->b_this_page;
	tail->b_this_page = head;
	page->buffers = head;
int block_write_full_page (struct file *file, struct page *page, fs_getblock_t fs_get_block)
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	int err, created, i;
	unsigned long block, phys, offset;
	struct buffer_head *bh, *head;

	if (!PageLocked(page))
		create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
	head = page->buffers;

	offset = page->offset;
	block = offset >> inode->i_sb->s_blocksize_bits;

	// FIXME: currently we assume page alignment.
	if (offset & (PAGE_SIZE-1))

		if (!bh->b_blocknr) {
			down(&inode->i_sem);
			phys = fs_get_block (inode, block, 1, &err, &created);
			init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
			bh->b_state = (1<<BH_Uptodate);
			/*
			 * block already exists, just mark it dirty:
			 */
			bh->b_end_io = end_buffer_io_sync;
			set_bit(BH_Uptodate, &bh->b_state);
		mark_buffer_dirty(bh, 0);

		bh = bh->b_this_page;
	} while (bh != head);

	SetPageUptodate(page);
	ClearPageUptodate(page);
int block_write_one_page (struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf, fs_getblock_t fs_get_block)
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long block;
	unsigned long blocksize, start_block, end_block;
	unsigned long start_offset, start_bytes, end_bytes;
	unsigned long bbits, phys, blocks, i, len;
	struct buffer_head *bh, *head;

	target_buf = (char *)page_address(page) + offset;

	if (!PageLocked(page))

	blocksize = inode->i_sb->s_blocksize;
		create_empty_buffers(page, inode, blocksize);
	head = page->buffers;

	bbits = inode->i_sb->s_blocksize_bits;
	block = page->offset >> bbits;
	blocks = PAGE_SIZE >> bbits;
	start_block = offset >> bbits;
	end_block = (offset + bytes - 1) >> bbits;
	start_offset = offset & (blocksize - 1);
	start_bytes = blocksize - start_offset;
	if (start_bytes > bytes)
		start_bytes = bytes;
	end_bytes = (offset+bytes) & (blocksize - 1);
	if (end_bytes > bytes)

	if (offset < 0 || offset >= PAGE_SIZE)
	if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
	if (start_block < 0 || start_block >= blocks)
	if (end_block < 0 || end_block >= blocks)
	// FIXME: currently we assume page alignment.
	if (page->offset & (PAGE_SIZE-1))

		if ((i < start_block) || (i > end_block)) {
		if (end_bytes && (i == end_block)) {
		/*
		 * Overwritten block.
		 */
		if (copy_from_user(target_buf, buf, len))

		/*
		 * we dirty buffers only after copying the data into
		 * the page - this way we can dirty the buffer even if
		 * the bh is still doing IO.
		 */
		if (!bh->b_blocknr) {
			down(&inode->i_sem);
			phys = fs_get_block (inode, block, 1, &err, &created);
			init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
			/*
			 * if partially written block which has contents on
			 * disk, then we have to read it first.
			 */
			if (!created && (start_offset ||
					(end_bytes && (i == end_block)))) {
				ll_rw_block(READ, 1, &bh);
				if (!buffer_uptodate(bh))
			bh->b_state = (1<<BH_Uptodate);
			/*
			 * block already exists, just mark it uptodate:
			 */
			bh->b_end_io = end_buffer_io_sync;
			set_bit(BH_Uptodate, &bh->b_state);
		mark_buffer_dirty(bh, 0);

		bh = bh->b_this_page;
	} while (bh != head);

	SetPageUptodate(page);
	ClearPageUptodate(page);
/*
 * Start I/O on a page.
 * This function expects the page to be locked and may return
 * before I/O is complete. You then have to check page->locked,
 * page->uptodate, and maybe wait on page->wait.
 */
int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
	struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
	int nr, fresh, block;

	if (!PageLocked(page))
		panic("brw_page: page not locked for I/O");
//	clear_bit(PG_error, &page->flags);

	/*
	 * We pretty much rely on the page lock for this, because
	 * create_page_buffers() might sleep.
	 */
	if (!page->buffers) {
		create_page_buffers(rw, page, dev, b, size, bmap);
	head = page->buffers;

		if (fresh && (bh->b_count != 0))
		if (bmap && !block) {
		if (!buffer_uptodate(bh)) {
		} else { /* WRITE */
			if (!bh->b_blocknr) {
				bh->b_blocknr = block;
			set_bit(BH_Uptodate, &bh->b_state);
			mark_buffer_dirty(bh, 0);
		bh = bh->b_this_page;
	} while (bh != head);

	if ((rw == READ) && nr) {
		if (Page_Uptodate(page))
		ll_rw_block(rw, nr, arr);
	if (!nr && rw == READ) {
		SetPageUptodate(page);
		page->owner = (int)current;
	if (nr && (rw == WRITE)) {
		ll_rw_block(rw, nr, arr);
/*
 * This is called by end_request() when I/O has completed.
 */
void mark_buffer_uptodate(struct buffer_head * bh, int on)
		struct buffer_head *tmp = bh;
		set_bit(BH_Uptodate, &bh->b_state);
		/* If a page has buffers and all these buffers are uptodate,
		 * then the page is uptodate. */
			if (!test_bit(BH_Uptodate, &tmp->b_state))
			tmp = tmp->b_this_page;
		} while (tmp && tmp != bh);
		page = mem_map + MAP_NR(bh->b_data);
		SetPageUptodate(page);
	clear_bit(BH_Uptodate, &bh->b_state);
/*
 * Generic "readpage" function for block devices that have the normal
 * bmap functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * mark_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
int generic_readpage(struct file * file, struct page * page)
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	unsigned long block;
	int *p, nr[PAGE_SIZE/512];

	if (page->buffers) {
		printk("hm, no brw_page(%p) because IO already started.\n",

	i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
	block = page->offset >> inode->i_sb->s_blocksize_bits;
		*p = inode->i_op->bmap(inode, block);

	brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
/*
 * Try to increase the number of buffers available: the size argument
 * is used to determine what kind of buffers we want.
 */
static int grow_buffers(int size)
	struct buffer_head *bh, *tmp;
	struct buffer_head * insert_point;

	if ((size & 511) || (size > PAGE_SIZE)) {
		printk("VFS: grow_buffers: size = %d\n",size);

	if (!(page = __get_free_page(GFP_BUFFER)))
	bh = create_buffers(page, size, 0);

	isize = BUFSIZE_INDEX(size);
	insert_point = free_list[isize];

		tmp->b_next_free = insert_point->b_next_free;
		tmp->b_prev_free = insert_point;
		insert_point->b_next_free->b_prev_free = tmp;
		insert_point->b_next_free = tmp;
		tmp->b_prev_free = tmp;
		tmp->b_next_free = tmp;
		if (tmp->b_this_page)
			tmp = tmp->b_this_page;

	tmp->b_this_page = bh;
	free_list[isize] = bh;
	mem_map[MAP_NR(page)].buffers = bh;
	buffermem += PAGE_SIZE;
/*
 * Can the buffer be thrown out?
 */
#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh)		((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
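
/*
 * Illustrative note (not from the original source): a buffer head with
 * b_count == 0 and none of BH_Dirty, BH_Lock or BH_Protected set in
 * b_state is not "busy", so try_to_free_buffers() below may strip it
 * from its page and release the page.
 */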
/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and free's the page if so.
 *
 * Wake up bdflush() if this fails - if we're running low on memory due
 * to dirty buffers, we need to flush them out as quickly as possible.
 */
int try_to_free_buffers(struct page * page)
	struct buffer_head * tmp, * bh = page->buffers;

		struct buffer_head * p = tmp;

		tmp = tmp->b_this_page;
		if (!buffer_busy(p))
	} while (tmp != bh);

		struct buffer_head * p = tmp;
		tmp = tmp->b_this_page;

		/* The buffer can be either on the regular queues or on the free list.. */
		if (p->b_dev == B_FREE)
			remove_from_free_list(p);
			remove_from_queues(p);
		put_unused_buffer_head(p);
	} while (tmp != bh);

	/* Wake up anyone waiting for buffer heads */
	wake_up(&buffer_wait);

	/* And free the page */
	page->buffers = NULL;
	if (__free_page(page)) {
		buffermem -= PAGE_SIZE;
/* ================== Debugging =================== */

void show_buffers(void)
	struct buffer_head * bh;
	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};

	printk("Buffer memory: %6dkB\n", buffermem>>10);
	printk("Buffer heads: %6d\n", nr_buffer_heads);
	printk("Buffer blocks: %6d\n", nr_buffers);
	printk("Buffer hashed: %6d\n", nr_hashed_buffers);

	for(nlist = 0; nlist < NR_LIST; nlist++) {
		found = locked = dirty = used = lastused = protected = 0;
		bh = lru_list[nlist];
			if (buffer_locked(bh))
			if (buffer_protected(bh))
			if (buffer_dirty(bh))
				used++, lastused = found;
			bh = bh->b_next_free;
		} while (bh != lru_list[nlist]);
		printk("%8s: %d buffers, %d used (last=%d), "
		       "%d locked, %d protected, %d dirty\n",
		       buf_types[nlist], found, used, lastused,
		       locked, protected, dirty);
/* ===================== Init ======================= */

/*
 * allocate the hash table and init the free list
 * Use gfp() for the hash table to decrease TLB misses, use
 * SLAB cache for buffer heads.
 */
void __init buffer_init(unsigned long memory_size)
	unsigned int nr_hash;

	/* we need to guess at the right sort of size for a buffer cache.
	   the heuristic from working with large databases and getting
	   fsync times (ext2) manageable, is the following */
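
	/*
	 * Worked example (not from the original source): the loop below picks
	 * the smallest order >= 5 with (1UL << order) >= memory_size, then
	 * retries progressively smaller orders until __get_free_pages()
	 * succeeds; e.g. an order-5 table with 4 kB pages and 4-byte pointers
	 * gives nr_hash = 32 * 4096 / 4 == 32768 entries, bh_hash_mask 0x7fff.
	 */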
	for (order = 5; (1UL << order) < memory_size; order++);

	/* try to allocate something until we get it or we're asking
	   for something that is really too small */
		nr_hash = (1UL << order) * PAGE_SIZE /
			   sizeof(struct buffer_head *);
		hash_table = (struct buffer_head **)
		    __get_free_pages(GFP_ATOMIC, order);
	} while (hash_table == NULL && --order > 4);
	printk("buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", nr_hash, order, (1UL<<order) * PAGE_SIZE);

		panic("Failed to allocate buffer hash table\n");
	memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
	bh_hash_mask = nr_hash-1;

	bh_cachep = kmem_cache_create("buffer_head",
				      sizeof(struct buffer_head),
				      SLAB_HWCACHE_ALIGN, NULL, NULL);
		panic("Cannot create buffer head SLAB cache\n");

	/*
	 * Allocate the reserved buffer heads.
	 */
	while (nr_buffer_heads < NR_RESERVED) {
		struct buffer_head * bh;

		bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
		put_unused_buffer_head(bh);

	lru_list[BUF_CLEAN] = 0;
	grow_buffers(BLOCK_SIZE);
/* ====================== bdflush support =================== */

/* This is a simple kernel daemon, whose job it is to provide a dynamic
 * response to dirty buffers.  Once this process is activated, we write back
 * a limited number of buffers to the disks and then go back to sleep again.
 */
static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
struct task_struct *bdflush_tsk = 0;

void wakeup_bdflush(int wait)
	if (current == bdflush_tsk)
	run_task_queue(&tq_disk);
	wake_up(&bdflush_wait);
	sleep_on(&bdflush_done);
/*
 * Here we attempt to write back old buffers.  We also try to flush inodes
 * and supers as well, since this function is essentially "update", and
 * otherwise there would be no way of ensuring that these quantities ever
 * get written back.  Ideally, we would have a timestamp on the inodes
 * and superblocks so that we could write back only the old ones as well
 */
static int sync_old_buffers(void)
	int ndirty, nwritten;
	struct buffer_head * bh, *next;

	for(nlist = 0; nlist < NR_LIST; nlist++)
	for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
		bh = lru_list[nlist];
		for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
			/* We may have stalled while waiting for I/O to complete. */
			if(bh->b_list != nlist) goto repeat;
			next = bh->b_next_free;
			if(!lru_list[nlist]) {
				printk("Dirty list empty %d\n", i);
			/* Clean buffer on dirty list?  Refile it */
			if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
			/* Unlocked buffer on locked list?  Refile it */
			if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
			if (buffer_locked(bh) || !buffer_dirty(bh))
			if(time_before(jiffies, bh->b_flushtime))
			bh->b_flushtime = 0;
			if(nlist != BUF_DIRTY) ncount++;
			ll_rw_block(WRITE, 1, &bh);
	run_task_queue(&tq_disk);

	if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
	printk("Wrote %d/%d buffers\n", nwritten, ndirty);
	run_task_queue(&tq_disk);
/* This is the interface to bdflush.  As we get more sophisticated, we can
 * pass tuning parameters to this "process", to adjust how it behaves.
 * We would want to verify each parameter, however, to make sure that it
 * is reasonable.
 */
asmlinkage int sys_bdflush(int func, long data)
	int i, error = -EPERM;

	if (!capable(CAP_SYS_ADMIN))
		error = sync_old_buffers();

	/* Basically func 1 means read param 1, 2 means write param 1, etc */
	if (i < 0 || i >= N_PARAM)
	if((func & 1) == 0) {
		error = put_user(bdf_prm.data[i], (int*)data);
	if (data < bdflush_min[i] || data > bdflush_max[i])
	bdf_prm.data[i] = data;

	/* Having func 0 used to launch the actual bdflush and then never
	 * return (unless explicitly killed). We return zero here to
	 * remain semi-compatible with present update(8) programs.
	 */
/* This is the actual bdflush daemon itself. It used to be started from
 * the syscall above, but now we launch it ourselves internally with
 * kernel_thread(...) directly after the first thread in init/main.c */

/* To prevent deadlocks for a loop device:
 * 1) Do non-blocking writes to loop (avoids deadlock with running
 *    out of request blocks).
 * 2) But do a blocking write if the only dirty buffers are loop buffers
 *    (otherwise we go into an infinite busy-loop).
 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
 *    with running out of free buffers for loop's "real" device).
 */
int bdflush(void * unused)
	struct buffer_head * bh, *next;
	int wrta_cmd = WRITEA;	/* non-blocking write for LOOP */

	/*
	 *	We have a bare-bones task_struct, and really should fill
	 *	in a few more things so "top" and /proc/2/{exe,root,cwd}
	 *	display semi-sane things. Not real crucial though...
	 */
	current->session = 1;
	sprintf(current->comm, "kflushd");
	bdflush_tsk = current;

	/*
	 *	As a kernel thread we want to tamper with system buffers
	 *	and other internals and thus be subject to the SMP locking
	 *	rules. (On a uniprocessor box this does nothing).
	 */
	printk("bdflush() activated...");

	CHECK_EMERGENCY_SYNC

	for(nlist = 0; nlist < NR_LIST; nlist++)
	for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
		bh = lru_list[nlist];
		for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
			/* We may have stalled while waiting for I/O to complete. */
			if(bh->b_list != nlist) goto repeat;
			next = bh->b_next_free;
			if(!lru_list[nlist]) {
				printk("Dirty list empty %d\n", i);
			/* Clean buffer on dirty list?  Refile it */
			if (nlist == BUF_DIRTY && !buffer_dirty(bh)) {
			/* Unlocked buffer on locked list?  Refile it */
			if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
			if (buffer_locked(bh) || !buffer_dirty(bh))
			major = MAJOR(bh->b_dev);
			/* Should we write back buffers that are shared or not??
			   currently dirty buffers are not shared, so it does not matter */
			bh->b_flushtime = 0;
			if (major == LOOP_MAJOR) {
				ll_rw_block(wrta_cmd,1, &bh);
				if (buffer_dirty(bh))
					ll_rw_block(WRITE, 1, &bh);
			if(nlist != BUF_DIRTY) ncount++;
		wake_up(&buffer_wait);

		if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
		printk("sleeping again.\n");

		/* If we didn't write anything, but there are still
		 * dirty buffers, then make the next write to a
		 * loop device to be a blocking write.
		 * This lets us block--which we _must_ do! */
		if (ndirty == 0 && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
		run_task_queue(&tq_disk);
		wake_up(&bdflush_done);

		/* If there are still a lot of dirty buffers around, skip the sleep
		   and flush some more */
		if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
			spin_lock_irq(&current->sigmask_lock);
			flush_signals(current);
			spin_unlock_irq(&current->sigmask_lock);

			interruptible_sleep_on(&bdflush_wait);