/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */
/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */
/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */
/* Added 32k buffer block sizes - these are required for older ARM systems. */
/* Thread it... -DaveM */

#include <linux/sched.h>
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/sysrq.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/iobuf.h>

#include <asm/uaccess.h>
#include <asm/bitops.h>
#include <asm/mmu_context.h>
static char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};
#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
					     number of unused buffer heads */
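
/*
 * Editor's sketch (not in the original file): how BUFSIZE_INDEX maps a
 * block size to a free-list slot via the table above.  A size is scaled
 * to 512-byte units and looked up; only powers of two have valid slots.
 */
#if 0
	BUFSIZE_INDEX(512);	/* 512>>9  == 1, buffersize_index[1] == 0 */
	BUFSIZE_INDEX(1024);	/* 1024>>9 == 2, buffersize_index[2] == 1 */
	BUFSIZE_INDEX(4096);	/* 4096>>9 == 8, buffersize_index[8] == 3 */
#endif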
/* Anti-deadlock ordering:
 *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
 */
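
/*
 * Editor's sketch (not in the original file): under the ordering above, a
 * path that needs several of these locks must take the "greater" lock
 * first, as insert_into_queues() does below:
 */
#if 0
	spin_lock(&lru_list_lock);	/* outermost lock */
	write_lock(&hash_table_lock);	/* nests inside lru_list_lock */
	/* ... touch hash chains and lru lists ... */
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
#endif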
static unsigned int bh_hash_mask = 0;
static unsigned int bh_hash_shift = 0;
static struct buffer_head **hash_table;
static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;

static struct buffer_head *lru_list[NR_LIST];
static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
static int nr_buffers_type[NR_LIST] = {0,};

static struct buffer_head * unused_list = NULL;
static int nr_unused_buffer_heads = 0;
static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
struct bh_free_head {
	struct buffer_head *list;
	spinlock_t lock;
};
static struct bh_free_head free_list[NR_SIZES];
static kmem_cache_t *bh_cachep;

static int grow_buffers(int size);
/* This is used by some architectures to estimate available memory. */
atomic_t buffermem = ATOMIC_INIT(0);
/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
union bdflush_param {
	struct {
		int nfract;	/* Percentage of buffer cache dirty to
				   activate bdflush */
		int ndirty;	/* Maximum number of dirty blocks to write out per
				   wake-cycle */
		int nrefill;	/* Number of clean buffers to try to obtain
				   each time we call refill */
		int nref_dirt;	/* Dirty buffer threshold for activating bdflush
				   when trying to refill buffers. */
		int dummy1;	/* unused */
		int age_buffer;	/* Time for normal buffer to age before we flush it */
		int age_super;	/* Time for superblock to age before we flush it */
		int dummy2;	/* unused */
		int dummy3;	/* unused */
	} b_un;
	unsigned int data[N_PARAM];
} bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
/* These are the min and max parameter values that we will allow to be assigned */
int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
void wakeup_bdflush(int);
/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and getting rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * if 'b_wait' is set before calling this, so that the queues aren't set
 * up unnecessarily.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	atomic_inc(&bh->b_count);
	add_wait_queue(&bh->b_wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (buffer_locked(bh)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
	atomic_dec(&bh->b_count);
}
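
/*
 * Editor's sketch (not in the original file): the usual caller pattern
 * that ends up in __wait_on_buffer() is "submit IO, sleep until unlock":
 */
#if 0
	ll_rw_block(READ, 1, &bh);	/* bh stays locked while IO runs */
	wait_on_buffer(bh);		/* inline wrapper around __wait_on_buffer() */
	if (!buffer_uptodate(bh))
		err = -EIO;		/* the read failed */
#endif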
/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed.  Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn.  Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
static int sync_buffers(kdev_t dev, int wait)
{
	int i, retry, pass = 0, err = 0;
	struct buffer_head * bh, *next;

	/* One pass for no-wait, three for wait:
	 * 0) write out all dirty, unlocked buffers;
	 * 1) write out all dirty buffers, waiting if locked;
	 * 2) wait for completion by waiting for all buffers to unlock.
	 */
	do {
		retry = 0;

		/* We search all lists as a failsafe mechanism, not because we expect
		 * there to be dirty buffers on any of the other lists.
		 */
repeat:
		spin_lock(&lru_list_lock);
		bh = lru_list[BUF_DIRTY];
		if (!bh)
			goto repeat2;

		for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
			next = bh->b_next_free;

			if (!lru_list[BUF_DIRTY])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				atomic_dec(&bh->b_count);
				goto repeat;
			}

			/* If an unlocked buffer is not uptodate, there has
			 * been an IO error. Skip it.
			 */
			if (wait && buffer_req(bh) && !buffer_locked(bh) &&
			    !buffer_dirty(bh) && !buffer_uptodate(bh)) {
				err = -EIO;
				continue;
			}

			/* Don't write clean buffers.  Don't write ANY buffers
			 * on the third pass.
			 */
			if (!buffer_dirty(bh) || pass >= 2)
				continue;

			atomic_inc(&bh->b_count);
			spin_unlock(&lru_list_lock);
			ll_rw_block(WRITE, 1, &bh);
			atomic_dec(&bh->b_count);
			retry = 1;
			goto repeat;
		}

repeat2:
		bh = lru_list[BUF_LOCKED];
		if (!bh) {
			spin_unlock(&lru_list_lock);
			break;
		}
		for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
			next = bh->b_next_free;

			if (!lru_list[BUF_LOCKED])
				break;
			if (dev && bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				/* Buffer is locked; skip it unless wait is
				 * requested AND pass > 0.
				 */
				if (!wait || !pass) {
					retry = 1;
					continue;
				}
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				spin_lock(&lru_list_lock);
				atomic_dec(&bh->b_count);
				goto repeat2;
			}
		}
		spin_unlock(&lru_list_lock);

		/* If we are waiting for the sync to succeed, and if any dirty
		 * blocks were written, then repeat; on the second pass, only
		 * wait for buffers being written (do not pass to write any
		 * more buffers on the second pass).
		 */
	} while (wait && retry && ++pass <= 2);
	return err;
}
void sync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	sync_buffers(dev, 0);
	DQUOT_SYNC(dev);
	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */
}
int fsync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	DQUOT_SYNC(dev);
	return sync_buffers(dev, 1);
}
asmlinkage int sys_sync(void)
{
	fsync_dev(0);
	return 0;
}
/*
 * filp may be NULL if called via the msync of a vma.
 */
int file_fsync(struct file *filp, struct dentry *dentry)
{
	struct inode * inode = dentry->d_inode;
	struct super_block * sb;
	kdev_t dev;

	/* sync the inode to buffers */
	write_inode_now(inode);

	/* sync the superblock to buffers */
	sb = inode->i_sb;
	wait_on_super(sb);
	if (sb->s_op && sb->s_op->write_super)
		sb->s_op->write_super(sb);

	/* .. finally sync the buffers to disk */
	dev = inode->i_dev;
	return sync_buffers(dev, 1);
}
asmlinkage int sys_fsync(unsigned int fd)
{
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;
	int err;

	err = -EBADF;
	file = fget(fd);
	if (!file)
		goto out;

	dentry = file->f_dentry;
	if (!dentry)
		goto out_putf;

	inode = dentry->d_inode;
	if (!inode)
		goto out_putf;

	err = -EINVAL;
	if (!file->f_op || !file->f_op->fsync)
		goto out_putf;

	/* We need to protect against concurrent writers.. */
	down(&inode->i_sem);
	err = file->f_op->fsync(file, dentry);
	up(&inode->i_sem);

out_putf:
	fput(file);
out:
	return err;
}
asmlinkage int sys_fdatasync(unsigned int fd)
{
	struct file * file;
	struct dentry * dentry;
	struct inode * inode;
	int err;

	err = -EBADF;
	file = fget(fd);
	if (!file)
		goto out;

	dentry = file->f_dentry;
	if (!dentry)
		goto out_putf;

	inode = dentry->d_inode;
	if (!inode)
		goto out_putf;

	err = -EINVAL;
	if (!file->f_op || !file->f_op->fsync)
		goto out_putf;

	/* this needs further work, at the moment it is identical to fsync() */
	down(&inode->i_sem);
	err = file->f_op->fsync(file, dentry);
	up(&inode->i_sem);

out_putf:
	fput(file);
out:
	return err;
}
void invalidate_buffers(kdev_t dev)
{
	int nlist;

	spin_lock(&lru_list_lock);
	for(nlist = 0; nlist < NR_LIST; nlist++) {
		struct buffer_head * bh;
		int i;
	again:
		bh = lru_list[nlist];
		if (!bh)
			continue;
		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
			if (bh->b_dev != dev)
				continue;
			if (buffer_locked(bh)) {
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				spin_lock(&lru_list_lock);
				atomic_dec(&bh->b_count);
				goto again;
			}
			if (atomic_read(&bh->b_count))
				continue;
			clear_bit(BH_Protected, &bh->b_state);
			clear_bit(BH_Uptodate, &bh->b_state);
			clear_bit(BH_Dirty, &bh->b_state);
			clear_bit(BH_Req, &bh->b_state);
		}
	}
	spin_unlock(&lru_list_lock);
}
/* After several hours of tedious analysis, the following hash
 * function won. Do not mess with it... -DaveM
 */
#define _hashfn(dev,block)	\
	((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
	 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
#define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
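
/*
 * Editor's sketch (not in the original file): a lookup just takes the
 * address of the chain head for a (dev, block) pair; bh_hash_mask keeps
 * the index inside the table.  dev and block are hypothetical here.
 */
#if 0
	struct buffer_head **head = &hash(dev, block);
	/* walk *head via bh->b_next while holding read_lock(&hash_table_lock) */
#endif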
static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
{
	if ((bh->b_next = *head) != NULL)
		bh->b_next->b_pprev = &bh->b_next;
	*head = bh;
	bh->b_pprev = head;
}

static __inline__ void __hash_unlink(struct buffer_head *bh)
{
	if (bh->b_next)
		bh->b_next->b_pprev = bh->b_pprev;
	*(bh->b_pprev) = bh->b_next;
	bh->b_pprev = NULL;
}
static void __insert_into_lru_list(struct buffer_head * bh, int blist)
{
	struct buffer_head **bhp = &lru_list[blist];

	if(!*bhp) {
		*bhp = bh;
		bh->b_prev_free = bh;
	}
	bh->b_next_free = *bhp;
	bh->b_prev_free = (*bhp)->b_prev_free;
	(*bhp)->b_prev_free->b_next_free = bh;
	(*bhp)->b_prev_free = bh;
	nr_buffers_type[blist]++;
}
static void __remove_from_lru_list(struct buffer_head * bh, int blist)
{
	if (bh->b_prev_free || bh->b_next_free) {
		bh->b_prev_free->b_next_free = bh->b_next_free;
		bh->b_next_free->b_prev_free = bh->b_prev_free;
		if (lru_list[blist] == bh)
			lru_list[blist] = bh->b_next_free;
		if (lru_list[blist] == bh)
			lru_list[blist] = NULL;
		bh->b_next_free = bh->b_prev_free = NULL;
		nr_buffers_type[blist]--;
	}
}
static void __remove_from_free_list(struct buffer_head * bh, int index)
{
	if(bh->b_next_free == bh)
		 free_list[index].list = NULL;
	else {
		bh->b_prev_free->b_next_free = bh->b_next_free;
		bh->b_next_free->b_prev_free = bh->b_prev_free;
		if (free_list[index].list == bh)
			 free_list[index].list = bh->b_next_free;
	}
	bh->b_next_free = bh->b_prev_free = NULL;
}
/* The following two functions must operate atomically
 * because they control the visibility of a buffer head
 * to the rest of the kernel.
 */
static __inline__ void __remove_from_queues(struct buffer_head *bh)
{
	write_lock(&hash_table_lock);
	if (bh->b_pprev)
		__hash_unlink(bh);
	__remove_from_lru_list(bh, bh->b_list);
	write_unlock(&hash_table_lock);
}
static void insert_into_queues(struct buffer_head *bh)
{
	struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);

	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	__hash_link(bh, head);
	__insert_into_lru_list(bh, bh->b_list);
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
}
/* This function must only run if there are no other
 * references _anywhere_ to this buffer head.
 */
static void put_last_free(struct buffer_head * bh)
{
	struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
	struct buffer_head **bhp = &head->list;

	spin_lock(&head->lock);
	bh->b_dev = B_FREE;
	if(!*bhp) {
		*bhp = bh;
		bh->b_prev_free = bh;
	}
	bh->b_next_free = *bhp;
	bh->b_prev_free = (*bhp)->b_prev_free;
	(*bhp)->b_prev_free->b_next_free = bh;
	(*bhp)->b_prev_free = bh;
	spin_unlock(&head->lock);
}
/*
 * Why like this, I hear you say... The reason is race-conditions.
 * As we don't lock buffers (unless we are reading them, that is),
 * something might happen to it while we sleep (ie a read-error
 * will force it bad). This shouldn't really happen currently, but
 * the code is ready.
 */
struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
{
	struct buffer_head **head = &hash(dev, block);
	struct buffer_head *bh;

	read_lock(&hash_table_lock);
	for(bh = *head; bh; bh = bh->b_next)
		if (bh->b_blocknr == block &&
		    bh->b_size == size &&
		    bh->b_dev == dev)
			break;
	if (bh)
		atomic_inc(&bh->b_count);
	read_unlock(&hash_table_lock);

	return bh;
}
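
/*
 * Editor's sketch (not in the original file): a cache probe.  On a hit the
 * buffer is returned with b_count already raised, so the caller must
 * balance it with brelse():
 */
#if 0
	struct buffer_head *bh = get_hash_table(dev, block, size);
	if (bh) {
		/* cache hit: bh->b_data is valid if buffer_uptodate(bh) */
		brelse(bh);
	}
#endif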
unsigned int get_hardblocksize(kdev_t dev)
{
	/*
	 * Get the hard sector size for the given device. If we don't know
	 * what it is, return 0.
	 */
	if (hardsect_size[MAJOR(dev)] != NULL) {
		int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
		if (blksize != 0)
			return blksize;
	}

	/*
	 * We don't know what the hardware sector size for this device is.
	 * Return 0 indicating that we don't know.
	 */
	return 0;
}
void set_blocksize(kdev_t dev, int size)
{
	extern int *blksize_size[];
	int i, nlist;
	struct buffer_head * bh, *bhnext;

	if (!blksize_size[MAJOR(dev)])
		return;

	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		panic("Invalid blocksize passed to set_blocksize");

	if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
		blksize_size[MAJOR(dev)][MINOR(dev)] = size;
		return;
	}
	if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
		return;
	sync_buffers(dev, 2);
	blksize_size[MAJOR(dev)][MINOR(dev)] = size;

	/* We need to be quite careful how we do this - we are moving entries
	 * around on the free list, and we can get in a loop if we are not careful.
	 */
	for(nlist = 0; nlist < NR_LIST; nlist++) {
	repeat:
		spin_lock(&lru_list_lock);
		bh = lru_list[nlist];
		for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
			if(!bh)
				break;

			bhnext = bh->b_next_free;
			if (bh->b_dev != dev)
				 continue;
			if (bh->b_size == size)
				 continue;
			if (buffer_locked(bh)) {
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				wait_on_buffer(bh);
				atomic_dec(&bh->b_count);
				goto repeat;
			}
			if (bh->b_dev == dev && bh->b_size != size) {
				clear_bit(BH_Dirty, &bh->b_state);
				clear_bit(BH_Uptodate, &bh->b_state);
				clear_bit(BH_Req, &bh->b_state);
			}
			if (atomic_read(&bh->b_count) == 0) {
				__remove_from_queues(bh);
				put_last_free(bh);
			}
		}
		spin_unlock(&lru_list_lock);
	}
}
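
/*
 * Editor's sketch (not in the original file): a driver or filesystem
 * switching a (hypothetical) device to 1k blocks before reading metadata.
 * The size must be a power of two in [512, PAGE_SIZE], or the checks
 * above panic.
 */
#if 0
	set_blocksize(dev, BLOCK_SIZE);			/* BLOCK_SIZE is 1024 */
	bh = bread(dev, 0, BLOCK_SIZE);			/* subsequent reads use 1k blocks */
#endif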
/*
 * We used to try various strange things. Let's not.
 */
static void refill_freelist(int size)
{
	if (!grow_buffers(size)) {
		wakeup_bdflush(1);
		current->policy |= SCHED_YIELD;
		schedule();
	}
}
void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
{
	bh->b_list = BUF_CLEAN;
	bh->b_flushtime = 0;
	bh->b_end_io = handler;
	bh->b_dev_id = dev_id;
}
static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}

static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
	BUG();
}
static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
{
	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
	unsigned long flags;
	struct buffer_head *tmp;
	struct page *page;
	int free;

	mark_buffer_uptodate(bh, uptodate);

	/* This is a temporary buffer used for page I/O. */
	page = mem_map + MAP_NR(bh->b_data);

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 *
	 * Async buffer_heads are here only as labels for IO, and get
	 * thrown away once the IO for this page is complete.  IO is
	 * deemed complete once all buffers have been visited
	 * (b_count==0) and are now unlocked. We must make sure that
	 * only the _last_ buffer that decrements its count is the one
	 * that free's the page..
	 */
	spin_lock_irqsave(&page_uptodate_lock, flags);
	unlock_buffer(bh);
	atomic_dec(&bh->b_count);
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (atomic_read(&tmp->b_count) &&
		    (tmp->b_end_io == end_buffer_io_async))
			goto still_busy;
		tmp = tmp->b_this_page;
	}

	/* OK, the async IO on this page is complete. */
	spin_unlock_irqrestore(&page_uptodate_lock, flags);

	/*
	 * if none of the buffers had errors then we can set the
	 * page uptodate:
	 */
	if (!PageError(page))
		SetPageUptodate(page);

	/*
	 * Run the hooks that have to be done when a page I/O has completed.
	 *
	 * Note - we need to test the flags before we unlock the page, but
	 * we must not actually free the page until after the unlock!
	 */
	if (test_and_clear_bit(PG_decr_after, &page->flags))
		atomic_dec(&nr_async_pages);

	if (test_and_clear_bit(PG_free_swap_after, &page->flags))
		swap_free(page->offset);

	free = test_and_clear_bit(PG_free_after, &page->flags);

	if (page->owner != (void *)-1)
		PAGE_BUG(page);
	page->owner = current;
	UnlockPage(page);

	if (free)
		__free_page(page);

	return;

still_busy:
	spin_unlock_irqrestore(&page_uptodate_lock, flags);
	return;
}
/*
 * Ok, this is getblk, and it isn't very clear, again to hinder
 * race-conditions. Most of the code is seldom used, (ie repeating),
 * so it should be much more efficient than it looks.
 *
 * The algorithm is changed: hopefully better, and an elusive bug removed.
 *
 * 14.02.92: changed it to sync dirty buffers a bit: better performance
 * when the filesystem starts to get full of dirty blocks (I hope).
 */
struct buffer_head * getblk(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;
	int isize;

repeat:
	bh = get_hash_table(dev, block, size);
	if (bh) {
		if (!buffer_dirty(bh)) {
			bh->b_flushtime = 0;
		}
		return bh;
	}

	isize = BUFSIZE_INDEX(size);
	spin_lock(&free_list[isize].lock);
	bh = free_list[isize].list;
	if (bh) {
		__remove_from_free_list(bh, isize);
		atomic_set(&bh->b_count, 1);
	}
	spin_unlock(&free_list[isize].lock);
	if (!bh)
		goto refill;

	/* OK, FINALLY we know that this buffer is the only one of its kind,
	 * we hold a reference (b_count>0), it is unlocked, and it is clean.
	 */
	init_buffer(bh, end_buffer_io_sync, NULL);
	bh->b_dev = dev;
	bh->b_blocknr = block;
	bh->b_state = 1 << BH_Mapped;

	/* Insert the buffer into the regular lists */
	insert_into_queues(bh);
	return bh;

	/*
	 * If we block while refilling the free list, somebody may
	 * create the buffer first ... search the hashes again.
	 */
refill:
	refill_freelist(size);
	goto repeat;
}
/*
 * if a new dirty buffer is created we need to balance bdflush.
 *
 * in the future we might want to make bdflush aware of different
 * pressures on different devices - thus the (currently unused)
 * 'dev' parameter.
 */
int too_many_dirty_buffers;
void balance_dirty(kdev_t dev)
{
	int dirty = nr_buffers_type[BUF_DIRTY];
	int ndirty = bdf_prm.b_un.ndirty;

	if (dirty > ndirty) {
		if (dirty > 2*ndirty) {
			too_many_dirty_buffers = 1;
			wakeup_bdflush(1);
			return;
		}
		wakeup_bdflush(0);
	}
	too_many_dirty_buffers = 0;
	return;
}
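
/*
 * Editor's sketch (not in the original file): writers are expected to call
 * balance_dirty() after dirtying a buffer, exactly as
 * block_write_partial_page() does further down:
 */
#if 0
	if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
		__mark_dirty(bh, 0);
		if (too_many_dirty_buffers)
			balance_dirty(bh->b_dev);
	}
#endif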
static inline void __mark_dirty(struct buffer_head *bh, int flag)
{
	bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
	clear_bit(BH_New, &bh->b_state);
	refile_buffer(bh);
}

void __mark_buffer_dirty(struct buffer_head *bh, int flag)
{
	__mark_dirty(bh, flag);
}
/*
 * A buffer may need to be moved from one buffer list to another
 * (e.g. in case it is not shared any more). Handle this.
 */
static __inline__ void __refile_buffer(struct buffer_head *bh)
{
	int dispose = BUF_CLEAN;
	if (buffer_locked(bh))
		dispose = BUF_LOCKED;
	if (buffer_dirty(bh))
		dispose = BUF_DIRTY;
	if (dispose != bh->b_list) {
		__remove_from_lru_list(bh, bh->b_list);
		bh->b_list = dispose;
		__insert_into_lru_list(bh, dispose);
	}
}

void refile_buffer(struct buffer_head *bh)
{
	spin_lock(&lru_list_lock);
	__refile_buffer(bh);
	spin_unlock(&lru_list_lock);
}
/*
 * Release a buffer head
 */
void __brelse(struct buffer_head * buf)
{
	if (atomic_read(&buf->b_count)) {
		atomic_dec(&buf->b_count);
		return;
	}
	printk("VFS: brelse: Trying to free free buffer\n");
}
/*
 * bforget() is like brelse(), except it puts the buffer on the
 * free list if it can.. We can NOT free the buffer if:
 * - there are other users of it
 * - it is locked and thus can have active IO
 */
void __bforget(struct buffer_head * buf)
{
	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
		atomic_dec(&buf->b_count);
	} else {
		atomic_set(&buf->b_count, 0);
		buf->b_state = 0;
		if (buf->b_pprev)
			__hash_unlink(buf);
		__remove_from_lru_list(buf, buf->b_list);
		put_last_free(buf);
	}
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
}
/*
 * bread() reads a specified block and returns the buffer that contains
 * it. It returns NULL if the block was unreadable.
 */
struct buffer_head * bread(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;

	bh = getblk(dev, block, size);
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}
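
/*
 * Editor's sketch (not in the original file): the classic read-modify-write
 * cycle built on bread(); dev, block, size and data are hypothetical.
 */
#if 0
	struct buffer_head *bh = bread(dev, block, size);
	if (bh) {
		memcpy(bh->b_data, data, size);	/* modify the cached block */
		mark_buffer_dirty(bh, 0);	/* queue it for writeback */
		brelse(bh);			/* drop the reference bread() took */
	}
#endif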
/*
 * Ok, breada can be used as bread, but additionally to mark other
 * blocks for reading as well. End the argument list with a negative
 * number.
 */

#define NBUF 16

struct buffer_head * breada(kdev_t dev, int block, int bufsize,
	unsigned int pos, unsigned int filesize)
{
	struct buffer_head * bhlist[NBUF];
	unsigned int blocks;
	struct buffer_head * bh;
	int index;
	int i, j;

	if (pos >= filesize)
		return NULL;

	if (block < 0)
		return NULL;

	bh = getblk(dev, block, bufsize);
	index = BUFSIZE_INDEX(bh->b_size);

	if (buffer_uptodate(bh))
		return bh;
	else ll_rw_block(READ, 1, &bh);

	blocks = (filesize - pos) >> (9+index);

	if (blocks < (read_ahead[MAJOR(dev)] >> index))
		blocks = read_ahead[MAJOR(dev)] >> index;
	if (blocks > NBUF)
		blocks = NBUF;

/*	if (blocks) printk("breada (new) %d blocks\n",blocks); */

	bhlist[0] = bh;
	j = 1;
	for(i=1; i<blocks; i++) {
		bh = getblk(dev,block+i,bufsize);
		if (buffer_uptodate(bh)) {
			brelse(bh);
			break;
		}
		else bhlist[j++] = bh;
	}

	/* Request the read for these buffers, and then release them. */
	if (j>1)
		ll_rw_block(READA, (j-1), bhlist+1);
	for(i=1; i<j; i++)
		brelse(bhlist[i]);

	/* Wait for this buffer, and then continue on. */
	bh = bhlist[0];
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}
/*
 * Note: the caller should wake up the buffer_wait list if needed.
 */
static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
{
	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
		kmem_cache_free(bh_cachep, bh);
	} else {
		init_waitqueue_head(&bh->b_wait);
		nr_unused_buffer_heads++;
		bh->b_next_free = unused_list;
		bh->b_this_page = NULL;
		unused_list = bh;
	}
}

static void put_unused_buffer_head(struct buffer_head *bh)
{
	spin_lock(&unused_list_lock);
	__put_unused_buffer_head(bh);
	spin_unlock(&unused_list_lock);
}
/*
 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
 * no-buffer-head deadlock.  Return NULL on failure; waiting for
 * buffer heads is now handled in create_buffers().
 */
static struct buffer_head * get_unused_buffer_head(int async)
{
	struct buffer_head * bh;

	spin_lock(&unused_list_lock);
	if (nr_unused_buffer_heads > NR_RESERVED) {
		bh = unused_list;
		unused_list = bh->b_next_free;
		nr_unused_buffer_heads--;
		spin_unlock(&unused_list_lock);
		return bh;
	}
	spin_unlock(&unused_list_lock);

	/* This is critical.  We can't swap out pages to get
	 * more buffer heads, because the swap-out may need
	 * more buffer-heads itself.  Thus SLAB_BUFFER.
	 */
	if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		init_waitqueue_head(&bh->b_wait);
		return bh;
	}

	/*
	 * If we need an async buffer, use the reserved buffer heads.
	 */
	if (async) {
		spin_lock(&unused_list_lock);
		if (unused_list) {
			bh = unused_list;
			unused_list = bh->b_next_free;
			nr_unused_buffer_heads--;
			spin_unlock(&unused_list_lock);
			return bh;
		}
		spin_unlock(&unused_list_lock);
	}

	/*
	 * (Pending further analysis ...)
	 * Ordinary (non-async) requests can use a different memory priority
	 * to free up pages. Any swapping thus generated will use async
	 * buffer heads.
	 */
	if(!async &&
	   (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
		memset(bh, 0, sizeof(*bh));
		init_waitqueue_head(&bh->b_wait);
		return bh;
	}

	return NULL;
}
/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The async flag is used to differentiate async IO (paging, swapping)
 * from ordinary buffer allocations, and only async requests are allowed
 * to sleep waiting for buffer heads.
 */
static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async)
{
	DECLARE_WAITQUEUE(wait, current);
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = get_unused_buffer_head(async);
		if (!bh)
			goto no_grow;

		bh->b_dev = B_FREE;  /* Flag as unused */
		bh->b_this_page = head;
		head = bh;

		bh->b_state = 0;
		bh->b_next_free = NULL;
		bh->b_pprev = NULL;
		atomic_set(&bh->b_count, 0);
		bh->b_size = size;

		bh->b_data = (char *) (page+offset);
		bh->b_list = BUF_CLEAN;
		bh->b_flushtime = 0;
		bh->b_end_io = end_buffer_io_bad;
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		do {
			bh = head;
			head = head->b_this_page;
			put_unused_buffer_head(bh);
		} while (head);

		/* Wake up any waiters ... */
		wake_up(&buffer_wait);
	}

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */
	if (!async)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	run_task_queue(&tq_disk);

	/*
	 * Set our state for sleeping, then check again for buffer heads.
	 * This ensures we won't miss a wake_up from an interrupt.
	 */
	add_wait_queue(&buffer_wait, &wait);
	current->state = TASK_UNINTERRUPTIBLE;
	if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
		current->policy |= SCHED_YIELD;
		schedule();
	}
	remove_wait_queue(&buffer_wait, &wait);
	current->state = TASK_RUNNING;
	goto try_again;
}
static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
{
	struct buffer_head *head, *bh, *tail;
	int block;

	if (!PageLocked(page))
		BUG();
	if (page->owner != current)
		PAGE_BUG(page);
	/*
	 * Allocate async buffer heads pointing to this page, just for I/O.
	 * They show up in the buffer hash table and are registered in
	 * page->buffers.
	 */
	head = create_buffers(page_address(page), size, 1);
	if (page->buffers)
		BUG();
	if (!head)
		BUG();
	tail = head;
	for (bh = head; bh; bh = bh->b_this_page) {
		block = *(b);
		b++;

		tail = bh;
		init_buffer(bh, end_buffer_io_async, NULL);
		bh->b_dev = dev;
		bh->b_blocknr = block;

		/*
		 * When we use bmap, we define block zero to represent
		 * a hole.  ll_rw_page, however, may legitimately
		 * access block zero, and we need to distinguish the
		 * two cases.
		 */
		if (bmap && !block) {
			memset(bh->b_data, 0, size);
			set_bit(BH_Uptodate, &bh->b_state);
			continue;
		}
		set_bit(BH_Mapped, &bh->b_state);
	}
	tail->b_this_page = head;
	page->buffers = head;
	return 0;
}
/*
 * We don't have to release all buffers here, but
 * we have to be sure that no dirty buffer is left
 * and no IO is going on (no buffer is locked), because
 * we have truncated the file and are going to free the
 * pages.
 */
int block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
{
	struct buffer_head *head, *bh, *next;
	unsigned int curr_off = 0;

	if (!PageLocked(page))
		BUG();
	if (!page->buffers)
		return 0;

	head = page->buffers;
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		/*
		 * is this block fully flushed?
		 */
		if (offset <= curr_off) {
			if (buffer_mapped(bh)) {
				atomic_inc(&bh->b_count);
				wait_on_buffer(bh);
				if (bh->b_dev == B_FREE)
					BUG();
				mark_buffer_clean(bh);
				clear_bit(BH_Uptodate, &bh->b_state);
				clear_bit(BH_Mapped, &bh->b_state);
				clear_bit(BH_Req, &bh->b_state);
				bh->b_blocknr = 0;
				atomic_dec(&bh->b_count);
			}
		}
		curr_off = next_off;
		bh = next;
	} while (bh != head);

	/*
	 * subtle. We release buffer-heads only if this is
	 * the 'final' flushpage. We have invalidated the bmap
	 * cached value unconditionally, so real IO is not
	 * possible anymore.
	 *
	 * If the free doesn't work out, the buffers can be
	 * left around - they just turn into anonymous buffers
	 * instead.
	 */
	if (!offset) {
		if (!try_to_free_buffers(page))
			atomic_add(PAGE_CACHE_SIZE, &buffermem);
	}

	return 0;
}
static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
{
	struct buffer_head *bh, *head, *tail;

	head = create_buffers(page_address(page), blocksize, 1);
	if (page->buffers)
		BUG();

	bh = head;
	do {
		bh->b_dev = inode->i_dev;
		bh->b_blocknr = 0;
		bh->b_end_io = end_buffer_io_bad;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;
	page->buffers = head;
}
/*
 * block_write_full_page() is SMP-safe - currently it's still
 * being called with the kernel lock held, but the code is ready.
 */
int block_write_full_page(struct file *file, struct page *page)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	int err;
	unsigned long block, offset;
	struct buffer_head *bh, *head;

	if (!PageLocked(page))
		BUG();

	if (!page->buffers)
		create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
	head = page->buffers;

	offset = page->offset;
	block = offset >> inode->i_sb->s_blocksize_bits;

	// FIXME: currently we assume page alignment.
	if (offset & (PAGE_SIZE-1))
		BUG();

	bh = head;
	do {
		/*
		 * If the buffer isn't up-to-date, we can't be sure
		 * that the buffer has been initialized with the proper
		 * block number information etc..
		 *
		 * Leave it to the low-level FS to make all those
		 * decisions (block #0 may actually be a valid block)
		 */
		bh->b_end_io = end_buffer_io_sync;
		if (!buffer_mapped(bh)) {
			err = inode->i_op->get_block(inode, block, bh, 1);
			if (err)
				goto out;
		}
		set_bit(BH_Uptodate, &bh->b_state);
		mark_buffer_dirty(bh,0);

		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	SetPageUptodate(page);
	return 0;
out:
	ClearPageUptodate(page);
	return err;
}
*file
, struct page
*page
, unsigned long offset
, unsigned long bytes
, const char * buf
)
1385 struct dentry
*dentry
= file
->f_dentry
;
1386 struct inode
*inode
= dentry
->d_inode
;
1387 unsigned long block
;
1389 unsigned long blocksize
, start_block
, end_block
;
1390 unsigned long start_offset
, start_bytes
, end_bytes
;
1391 unsigned long bbits
, blocks
, i
, len
;
1392 struct buffer_head
*bh
, *head
;
1395 target_buf
= (char *)page_address(page
) + offset
;
1397 if (!PageLocked(page
))
1400 blocksize
= inode
->i_sb
->s_blocksize
;
1402 create_empty_buffers(page
, inode
, blocksize
);
1403 head
= page
->buffers
;
1405 bbits
= inode
->i_sb
->s_blocksize_bits
;
1406 block
= page
->offset
>> bbits
;
1407 blocks
= PAGE_SIZE
>> bbits
;
1408 start_block
= offset
>> bbits
;
1409 end_block
= (offset
+ bytes
- 1) >> bbits
;
1410 start_offset
= offset
& (blocksize
- 1);
1411 start_bytes
= blocksize
- start_offset
;
1412 if (start_bytes
> bytes
)
1413 start_bytes
= bytes
;
1414 end_bytes
= (offset
+bytes
) & (blocksize
- 1);
1415 if (end_bytes
> bytes
)
1418 if (offset
< 0 || offset
>= PAGE_SIZE
)
1420 if (bytes
+offset
< 0 || bytes
+offset
> PAGE_SIZE
)
1422 if (start_block
< 0 || start_block
>= blocks
)
1424 if (end_block
< 0 || end_block
>= blocks
)
1426 // FIXME: currently we assume page alignment.
1427 if (page
->offset
& (PAGE_SIZE
-1))
1437 if ((i
< start_block
) || (i
> end_block
)) {
1438 if (!buffer_uptodate(bh
))
1444 * If the buffer is not up-to-date, we need to ask the low-level
1445 * FS to do something for us (we used to have assumptions about
1446 * the meaning of b_blocknr etc, that's bad).
1448 * If "update" is set, that means that the low-level FS should
1449 * try to make sure that the block is up-to-date because we're
1450 * not going to fill it completely.
1452 bh
->b_end_io
= end_buffer_io_sync
;
1453 if (!buffer_mapped(bh
)) {
1454 err
= inode
->i_op
->get_block(inode
, block
, bh
, 1);
1459 if (!buffer_uptodate(bh
) && (start_offset
|| (end_bytes
&& (i
== end_block
)))) {
1460 if (buffer_new(bh
)) {
1461 memset(bh
->b_data
, 0, bh
->b_size
);
1463 ll_rw_block(READ
, 1, &bh
);
1466 if (!buffer_uptodate(bh
))
1475 } else if (end_bytes
&& (i
== end_block
)) {
1479 err
= copy_from_user(target_buf
, buf
, len
);
1484 * we dirty buffers only after copying the data into
1485 * the page - this way we can dirty the buffer even if
1486 * the bh is still doing IO.
1488 * NOTE! This also does a direct dirty balace check,
1489 * rather than relying on bdflush just waking up every
1490 * once in a while. This is to catch (and slow down)
1491 * the processes that write tons of buffer..
1493 * Note how we do NOT want to do this in the full block
1494 * case: full pages are flushed not by the people who
1495 * dirtied them, but by people who need memory. And we
1496 * should not penalize them for somebody else writing
1497 * lots of dirty pages.
1499 set_bit(BH_Uptodate
, &bh
->b_state
);
1500 if (!test_and_set_bit(BH_Dirty
, &bh
->b_state
)) {
1501 __mark_dirty(bh
, 0);
1502 if (too_many_dirty_buffers
)
1503 balance_dirty(bh
->b_dev
);
1514 bh
= bh
->b_this_page
;
1515 } while (bh
!= head
);
1518 * is this a partial write that happened to make all buffers
1519 * uptodate then we can optimize away a bogus readpage() for
1520 * the next read(). Here we 'discover' wether the page went
1521 * uptodate as a result of this (potentially partial) write.
1524 SetPageUptodate(page
);
1527 ClearPageUptodate(page
);
/*
 * IO completion routine for a buffer_head being used for kiobuf IO: we
 * can't dispatch the kiobuf callback until io_count reaches 0.
 */
static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
{
	struct kiobuf *kiobuf;

	mark_buffer_uptodate(bh, uptodate);

	kiobuf = bh->b_kiobuf;
	if (atomic_dec_and_test(&kiobuf->io_count))
		kiobuf->end_io(kiobuf);
	if (!uptodate)
		kiobuf->errno = -EIO;
}
/*
 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
 * for them to complete.  Clean up the buffer_heads afterwards.
 */

#define dprintk(x...)

static int do_kio(struct kiobuf *kiobuf,
		  int rw, int nr, struct buffer_head *bh[], int size)
{
	int iosize;
	int i;
	struct buffer_head *tmp;
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	dprintk ("do_kio start %d\n", rw);

	atomic_add(nr, &kiobuf->io_count);
	kiobuf->errno = 0;
	ll_rw_block(rw, nr, bh);

	kiobuf_wait_for_io(kiobuf);

	spin_lock(&unused_list_lock);

	iosize = 0;
	for (i = nr; --i >= 0; ) {
		iosize += size;
		tmp = bh[i];
		if (!buffer_uptodate(tmp)) {
			/* We are traversing bh'es in reverse order so
                           clearing iosize on error calculates the
                           amount of IO before the first error. */
			iosize = 0;
		}
		__put_unused_buffer_head(tmp);
	}

	spin_unlock(&unused_list_lock);

	dprintk ("do_kio end %d %d\n", iosize, err);

	if (iosize)
		return iosize;
	return kiobuf->errno;
}
/*
 * Start I/O on a physical range of kernel memory, defined by a vector
 * of kiobuf structs (much like a user-space iovec list).
 *
 * The kiobuf must already be locked for IO.  IO is submitted
 * asynchronously: you need to check page->locked, page->uptodate, and
 * maybe wait on page->wait.
 *
 * It is up to the caller to make sure that there are enough blocks
 * passed in to completely map the iobufs to disk.
 */
int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
	       kdev_t dev, unsigned long b[], int size, int bmap)
{
	int err;
	int length;
	int transferred;
	int i;
	int bufind;
	int pageind;
	int bhind;
	int offset;
	unsigned long blocknr;
	struct kiobuf * iobuf = NULL;
	unsigned long page;
	unsigned long map;
	struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];

	if (!nr)
		return 0;

	/*
	 * First, do some alignment and validity checks
	 */
	for (i = 0; i < nr; i++) {
		iobuf = iovec[i];
		if ((iobuf->offset & (size-1)) ||
		    (iobuf->length & (size-1)))
			return -EINVAL;
		if (!iobuf->locked)
			panic("brw_kiovec: iobuf not locked for I/O");
		if (!iobuf->nr_pages)
			panic("brw_kiovec: iobuf not initialised");
	}

	/* DEBUG */
#if 0
	return iobuf->length;
#endif
	dprintk ("brw_kiovec: start\n");

	/*
	 * OK to walk down the iovec doing page IO on each page we find.
	 */
	bufind = bhind = transferred = err = 0;
	for (i = 0; i < nr; i++) {
		iobuf = iovec[i];
		offset = iobuf->offset;
		length = iobuf->length;
		dprintk ("iobuf %d %d %d\n", offset, length, size);

		for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
			page = iobuf->pagelist[pageind];
			map  = iobuf->maplist[pageind];

			while (length > 0) {
				blocknr = b[bufind++];
				tmp = get_unused_buffer_head(0);
				if (!tmp) {
					err = -ENOMEM;
					goto error;
				}

				tmp->b_dev = B_FREE;
				tmp->b_size = size;
				tmp->b_data = (char *) (page + offset);
				tmp->b_this_page = tmp;

				init_buffer(tmp, end_buffer_io_kiobuf, NULL);
				tmp->b_dev = dev;
				tmp->b_blocknr = blocknr;
				tmp->b_state = 1 << BH_Mapped;
				tmp->b_kiobuf = iobuf;

				if (rw == WRITE) {
					set_bit(BH_Uptodate, &tmp->b_state);
					set_bit(BH_Dirty, &tmp->b_state);
				}

				dprintk ("buffer %d (%d) at %p\n",
					 bhind, tmp->b_blocknr, tmp->b_data);
				bh[bhind++] = tmp;
				length -= size;
				offset += size;

				/*
				 * Start the IO if we have got too much
				 */
				if (bhind >= KIO_MAX_SECTORS) {
					err = do_kio(iobuf, rw, bhind, bh, size);
					if (err >= 0)
						transferred += err;
					else
						goto finished;
					bhind = 0;
				}

				if (offset >= PAGE_SIZE) {
					offset = 0;
					break;
				}
			} /* End of block loop */
		} /* End of page loop */
	} /* End of iovec loop */

	/* Is there any IO still left to submit? */
	if (bhind) {
		err = do_kio(iobuf, rw, bhind, bh, size);
		if (err >= 0)
			transferred += err;
	}

finished:
	dprintk ("brw_kiovec: end (%d, %d)\n", transferred, err);
	if (transferred)
		return transferred;
	return err;

error:
	/* We got an error allocating the bh'es.  Just free the current
	   buffer_heads and exit. */
	spin_lock(&unused_list_lock);
	for (i = bhind; --i >= 0; ) {
		__put_unused_buffer_head(bh[i]);
	}
	spin_unlock(&unused_list_lock);
	goto finished;
}
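
/*
 * Editor's sketch (not in the original file): writing one already-locked
 * kiobuf, assuming the caller has filled blocks[] with the disk blocks
 * that map the iobuf; dev, blocks and blocksize are hypothetical.
 */
#if 0
	err = brw_kiovec(WRITE, 1, &iobuf, dev, blocks, blocksize, 0);
	if (err < 0)
		printk("kiobuf write failed: %d\n", err);
#endif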
/*
 * Start I/O on a page.
 * This function expects the page to be locked and may return
 * before I/O is complete. You then have to check page->locked,
 * page->uptodate, and maybe wait on page->wait.
 *
 * brw_page() is SMP-safe, although it's being called with the
 * kernel lock held - but the code is ready.
 *
 * FIXME: we need a swapper_inode->get_block function to remove
 * some of the bmap kludges and interface ugliness here.
 */
int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
{
	struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
	int nr, fresh /* temporary debugging flag */, block;

	if (!PageLocked(page))
		panic("brw_page: page not locked for I/O");
//	clear_bit(PG_error, &page->flags);

	/*
	 * We pretty much rely on the page lock for this, because
	 * create_page_buffers() might sleep.
	 */
	fresh = 0;
	if (!page->buffers) {
		create_page_buffers(rw, page, dev, b, size, bmap);
		fresh = 1;
	}
	if (!page->buffers)
		BUG();
	page->owner = (void *)-1;

	head = page->buffers;
	bh = head;
	nr = 0;
	do {
		block = *(b);
		b++;

		if (fresh && (atomic_read(&bh->b_count) != 0))
			BUG();
		if (rw == READ) {
			if (bmap && !block) {
				/* hole: data was zeroed in create_page_buffers() */
			}
			if (!buffer_uptodate(bh)) {
				arr[nr++] = bh;
				atomic_inc(&bh->b_count);
			}
		} else { /* WRITE */
			if (!bh->b_blocknr) {
				bh->b_blocknr = block;
			}
			set_bit(BH_Uptodate, &bh->b_state);
			set_bit(BH_Dirty, &bh->b_state);
			arr[nr++] = bh;
			atomic_inc(&bh->b_count);
		}
		bh = bh->b_this_page;
	} while (bh != head);
	if ((rw == READ) && nr) {
		if (Page_Uptodate(page))
			BUG();
		ll_rw_block(rw, nr, arr);
	} else {
		if (!nr && rw == READ) {
			SetPageUptodate(page);
			page->owner = current;
			UnlockPage(page);
		}
		if (nr && (rw == WRITE))
			ll_rw_block(rw, nr, arr);
	}
	return 0;
}
1836 * Generic "read page" function for block devices that have the normal
1837 * bmap functionality. This is most of the block device filesystems.
1838 * Reads the page asynchronously --- the unlock_buffer() and
1839 * mark_buffer_uptodate() functions propagate buffer state into the
1840 * page struct once IO has completed.
1842 int block_read_full_page(struct file
* file
, struct page
* page
)
1844 struct dentry
*dentry
= file
->f_dentry
;
1845 struct inode
*inode
= dentry
->d_inode
;
1846 unsigned long iblock
;
1847 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
];
1848 unsigned int blocksize
, blocks
;
1851 if (!PageLocked(page
))
1853 blocksize
= inode
->i_sb
->s_blocksize
;
1855 create_empty_buffers(page
, inode
, blocksize
);
1856 head
= page
->buffers
;
1858 blocks
= PAGE_SIZE
>> inode
->i_sb
->s_blocksize_bits
;
1859 iblock
= page
->offset
>> inode
->i_sb
->s_blocksize_bits
;
1860 page
->owner
= (void *)-1;
1861 head
= page
->buffers
;
1866 if (buffer_uptodate(bh
))
1869 if (!buffer_mapped(bh
)) {
1870 inode
->i_op
->get_block(inode
, iblock
, bh
, 0);
1871 if (!buffer_mapped(bh
)) {
1872 memset(bh
->b_data
, 0, blocksize
);
1873 set_bit(BH_Uptodate
, &bh
->b_state
);
1878 init_buffer(bh
, end_buffer_io_async
, NULL
);
1879 atomic_inc(&bh
->b_count
);
1882 } while (iblock
++, (bh
= bh
->b_this_page
) != head
);
1886 if (Page_Uptodate(page
))
1888 ll_rw_block(READ
, nr
, arr
);
1891 * all buffers are uptodate - we can set the page
1894 SetPageUptodate(page
);
1895 page
->owner
= current
;
/*
 * Try to increase the number of buffers available: the size argument
 * is used to determine what kind of buffers we want.
 */
static int grow_buffers(int size)
{
	unsigned long page;
	struct buffer_head *bh, *tmp;
	struct buffer_head * insert_point;
	int isize;

	if ((size & 511) || (size > PAGE_SIZE)) {
		printk("VFS: grow_buffers: size = %d\n",size);
		return 0;
	}

	if (!(page = __get_free_page(GFP_BUFFER)))
		return 0;
	bh = create_buffers(page, size, 0);
	if (!bh) {
		free_page(page);
		return 0;
	}

	isize = BUFSIZE_INDEX(size);

	spin_lock(&free_list[isize].lock);
	insert_point = free_list[isize].list;
	tmp = bh;
	while (1) {
		if (insert_point) {
			tmp->b_next_free = insert_point->b_next_free;
			tmp->b_prev_free = insert_point;
			insert_point->b_next_free->b_prev_free = tmp;
			insert_point->b_next_free = tmp;
		} else {
			tmp->b_prev_free = tmp;
			tmp->b_next_free = tmp;
		}
		insert_point = tmp;
		if (tmp->b_this_page)
			tmp = tmp->b_this_page;
		else
			break;
	}
	tmp->b_this_page = bh;
	free_list[isize].list = bh;
	spin_unlock(&free_list[isize].lock);

	mem_map[MAP_NR(page)].buffers = bh;
	atomic_add(PAGE_SIZE, &buffermem);
	return 1;
}
/*
 * Can the buffer be thrown out?
 */
#define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
#define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and free's the page if so.
 *
 * Wake up bdflush() if this fails - if we're running low on memory due
 * to dirty buffers, we need to flush them out as quickly as possible.
 *
 * NOTE: There are quite a number of ways that threads of control can
 *       obtain a reference to a buffer head within a page.  So we must
 *       lock out all of these paths to cleanly toss the page.
 */
int try_to_free_buffers(struct page * page)
{
	struct buffer_head * tmp, * bh = page->buffers;
	int index = BUFSIZE_INDEX(bh->b_size);
	int ret;

	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	spin_lock(&free_list[index].lock);
	tmp = bh;
	do {
		struct buffer_head * p = tmp;

		tmp = tmp->b_this_page;
		if (buffer_busy(p))
			goto busy_buffer_page;
	} while (tmp != bh);

	spin_lock(&unused_list_lock);
	tmp = bh;
	do {
		struct buffer_head * p = tmp;
		tmp = tmp->b_this_page;

		/* The buffer can be either on the regular
		 * queues or on the free list..
		 */
		if (p->b_dev == B_FREE) {
			__remove_from_free_list(p, index);
		} else {
			if (p->b_pprev)
				__hash_unlink(p);
			__remove_from_lru_list(p, p->b_list);
		}
		__put_unused_buffer_head(p);
	} while (tmp != bh);
	spin_unlock(&unused_list_lock);

	/* Wake up anyone waiting for buffer heads */
	wake_up(&buffer_wait);

	/* And free the page */
	page->buffers = NULL;
	__free_page(page);
	ret = 1;
out:
	spin_unlock(&free_list[index].lock);
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
	return ret;

busy_buffer_page:
	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
	too_many_dirty_buffers = 1;
	wakeup_bdflush(0);
	ret = 0;
	goto out;
}
/* ===================== Init ======================= */

/*
 * allocate the hash table and init the free list
 * Use gfp() for the hash table to decrease TLB misses, use
 * SLAB cache for buffer heads.
 */
void __init buffer_init(unsigned long memory_size)
{
	int order, i;
	unsigned int nr_hash;

	/* The buffer cache hash table is less important these days,
	 * trim it a bit.
	 */
	memory_size >>= 14;
	memory_size *= sizeof(struct buffer_head *);
	for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
		;

	/* try to allocate something until we get it or we're asking
	   for something that is really too small */

	do {
		unsigned long tmp;

		nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
		bh_hash_mask = (nr_hash - 1);

		tmp = nr_hash;
		bh_hash_shift = 0;
		while((tmp >>= 1UL) != 0UL)
			bh_hash_shift++;

		hash_table = (struct buffer_head **)
		    __get_free_pages(GFP_ATOMIC, order);
	} while (hash_table == NULL && --order > 0);
	printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
	       nr_hash, order, (1UL<<order) * PAGE_SIZE);

	if (!hash_table)
		panic("Failed to allocate buffer hash table\n");

	/* Setup hash chains. */
	for(i = 0; i < nr_hash; i++)
		hash_table[i] = NULL;

	/* Setup free lists. */
	for(i = 0; i < NR_SIZES; i++) {
		free_list[i].list = NULL;
		free_list[i].lock = SPIN_LOCK_UNLOCKED;
	}

	/* Setup lru lists. */
	for(i = 0; i < NR_LIST; i++)
		lru_list[i] = NULL;

	bh_cachep = kmem_cache_create("buffer_head",
				      sizeof(struct buffer_head),
				      0,
				      SLAB_HWCACHE_ALIGN, NULL, NULL);
	if(!bh_cachep)
		panic("Cannot create buffer head SLAB cache\n");
}
/* ====================== bdflush support =================== */

/* This is a simple kernel daemon, whose job it is to provide a dynamic
 * response to dirty buffers.  Once this process is activated, we write back
 * a limited number of buffers to the disks and then go back to sleep again.
 */
static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
struct task_struct *bdflush_tsk = 0;

void wakeup_bdflush(int wait)
{
	if (current == bdflush_tsk)
		return;
	if (wait)
		run_task_queue(&tq_disk);
	wake_up(&bdflush_wait);
	if (wait)
		sleep_on(&bdflush_done);
}
/*
 * Here we attempt to write back old buffers.  We also try to flush inodes
 * and supers as well, since this function is essentially "update", and
 * otherwise there would be no way of ensuring that these quantities ever
 * get written back.  Ideally, we would have a timestamp on the inodes
 * and superblocks so that we could write back only the old ones as well.
 */
static int sync_old_buffers(void)
{
	int nlist;

	lock_kernel();
	sync_supers(0);
	sync_inodes(0);
	unlock_kernel();

	for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
		struct buffer_head *bh;
	repeat:
		spin_lock(&lru_list_lock);
		bh = lru_list[nlist];
		if(bh) {
			struct buffer_head *next;
			int i;
			for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
				next = bh->b_next_free;

				/* If the buffer is not on the proper list,
				 * then refile it.
				 */
				if ((nlist == BUF_DIRTY &&
				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
					__refile_buffer(bh);
					continue;
				}

				if (buffer_locked(bh) || !buffer_dirty(bh))
					continue;

				/* OK, now we are committed to write it out. */
				bh->b_flushtime = 0;
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				ll_rw_block(WRITE, 1, &bh);
				atomic_dec(&bh->b_count);
				goto repeat;
			}
		}
		spin_unlock(&lru_list_lock);
	}
	run_task_queue(&tq_disk);
	return 0;
}
/* This is the interface to bdflush.  As we get more sophisticated, we can
 * pass tuning parameters to this "process", to adjust how it behaves.
 * We would want to verify each parameter, however, to make sure that it
 * is reasonable. */
asmlinkage int sys_bdflush(int func, long data)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (func == 1) {
		int error;
		struct mm_struct *user_mm;

		/*
		 * bdflush will spend all of its time in kernel-space,
		 * without touching user-space, so we can switch it into
		 * 'lazy TLB mode' to reduce the cost of context-switches
		 * to and from bdflush.
		 */
		user_mm = start_lazy_tlb();
		error = sync_old_buffers();
		end_lazy_tlb(user_mm);
		return error;
	}

	/* Basically func 1 means read param 1, 2 means write param 1, etc */
	if (func >= 2) {
		int i = (func-2) >> 1;
		if (i >= 0 && i < N_PARAM) {
			if ((func & 1) == 0)
				return put_user(bdf_prm.data[i], (int*)data);
			if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
				bdf_prm.data[i] = data;
				return 0;
			}
		}
		return -EINVAL;
	}

	/* Having func 0 used to launch the actual bdflush and then never
	 * return (unless explicitly killed). We return zero here to
	 * remain semi-compatible with present update(8) programs.
	 */
	return 0;
}
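
/*
 * Editor's sketch (not in the original file): from user space, parameter i
 * is read with func 2*i+2 and written with func 2*i+3, so ndirty
 * (parameter 1) is read with func 4 and set with func 5:
 */
#if 0
	int ndirty;
	syscall(SYS_bdflush, 4, (long) &ndirty);	/* read bdf_prm.data[1] */
	syscall(SYS_bdflush, 5, 1000L);			/* set ndirty to 1000 */
#endif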
/*
 * This is the actual bdflush daemon itself. It used to be started from
 * the syscall above, but now we launch it ourselves internally with
 * kernel_thread(...) directly after the first thread in init/main.c
 */
int bdflush(void * unused)
{
	/*
	 *	We have a bare-bones task_struct, and really should fill
	 *	in a few more things so "top" and /proc/2/{exe,root,cwd}
	 *	display semi-sane things. Not real crucial though...
	 */

	current->session = 1;
	current->pgrp = 1;
	sprintf(current->comm, "kflushd");
	bdflush_tsk = current;

	for (;;) {
		int nlist;

		CHECK_EMERGENCY_SYNC

		for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
			int nr, major, written = 0;
			struct buffer_head *next;

		repeat:
			spin_lock(&lru_list_lock);
			next = lru_list[nlist];
			nr = nr_buffers_type[nlist];
			while (nr-- > 0) {
				struct buffer_head *bh = next;

				next = next->b_next_free;

				/* If the buffer is not on the correct list,
				 * then refile it.
				 */
				if ((nlist == BUF_DIRTY &&
				     (!buffer_dirty(bh) && !buffer_locked(bh))) ||
				    (nlist == BUF_LOCKED && !buffer_locked(bh))) {
					__refile_buffer(bh);
					continue;
				}

				/* If we aren't in panic mode, don't write out too much
				 * at a time. Also, don't write out buffers we don't
				 * really have to write out yet..
				 */
				if (!too_many_dirty_buffers) {
					if (written > bdf_prm.b_un.ndirty)
						break;
					if (time_before(jiffies, bh->b_flushtime))
						continue;
				}

				if (buffer_locked(bh) || !buffer_dirty(bh))
					continue;

				major = MAJOR(bh->b_dev);
				written++;
				bh->b_flushtime = 0;

				/*
				 * For the loop major we can try to do asynchronous writes,
				 * but we have to guarantee that we're making some progress..
				 */
				atomic_inc(&bh->b_count);
				spin_unlock(&lru_list_lock);
				if (major == LOOP_MAJOR && written > 1) {
					ll_rw_block(WRITEA, 1, &bh);
					if (buffer_dirty(bh))
						--written;
				} else
					ll_rw_block(WRITE, 1, &bh);
				atomic_dec(&bh->b_count);
				goto repeat;
			}
			spin_unlock(&lru_list_lock);
		}
		run_task_queue(&tq_disk);
		wake_up(&bdflush_done);

		/*
		 * If there are still a lot of dirty buffers around,
		 * skip the sleep and flush some more. Otherwise, we
		 * sleep for a while and mark us as not being in panic
		 * mode..
		 */
		if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
			too_many_dirty_buffers = 0;
			spin_lock_irq(&current->sigmask_lock);
			flush_signals(current);
			spin_unlock_irq(&current->sigmask_lock);
			interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);
		}
	}
}