4 * Copyright (C) 1991, 1992 Linus Torvalds
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required on older ARM systems.
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
54 static char buffersize_index
[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
74 static unsigned int bh_hash_mask
;
75 static unsigned int bh_hash_shift
;
76 static struct buffer_head
**hash_table
;
77 static rwlock_t hash_table_lock
= RW_LOCK_UNLOCKED
;
79 static struct buffer_head
*lru_list
[NR_LIST
];
80 static spinlock_t lru_list_lock
= SPIN_LOCK_UNLOCKED
;
81 static int nr_buffers_type
[NR_LIST
];
82 static unsigned long size_buffers_type
[NR_LIST
];
84 static struct buffer_head
* unused_list
;
85 static int nr_unused_buffer_heads
;
86 static spinlock_t unused_list_lock
= SPIN_LOCK_UNLOCKED
;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait
);
90 struct buffer_head
*list
;
93 static struct bh_free_head free_list
[NR_SIZES
];
95 kmem_cache_t
*bh_cachep
;
97 static int grow_buffers(int size
);
98 static void __refile_buffer(struct buffer_head
*);
100 /* This is used by some architectures to estimate available memory. */
101 atomic_t buffermem_pages
= ATOMIC_INIT(0);
103 /* Here is the parameter block for the bdflush process. If you add or
104 * remove any of the parameters, make sure to update kernel/sysctl.c.
109 /* The dummy values in this structure are left in there for compatibility
110 * with old programs that play with the /proc entries.
112 union bdflush_param
{
114 int nfract
; /* Percentage of buffer cache dirty to
116 int ndirty
; /* Maximum number of dirty blocks to write out per
118 int nrefill
; /* Number of clean buffers to try to obtain
119 each time we call refill */
120 int nref_dirt
; /* Dirty buffer threshold for activating bdflush
121 when trying to refill buffers. */
122 int interval
; /* jiffies delay between kupdate flushes */
123 int age_buffer
; /* Time for normal buffer to age before we flush it */
124 int age_super
; /* Time for superblock to age before we flush it */
125 int dummy2
; /* unused */
126 int dummy3
; /* unused */
128 unsigned int data
[N_PARAM
];
129 } bdf_prm
= {{40, 500, 64, 256, 5*HZ
, 30*HZ
, 5*HZ
, 1884, 2}};
131 /* These are the min and max parameter values that we will allow to be assigned */
132 int bdflush_min
[N_PARAM
] = { 0, 10, 5, 25, 0, 1*HZ
, 1*HZ
, 1, 1};
133 int bdflush_max
[N_PARAM
] = {100,50000, 20000, 20000,600*HZ
, 6000*HZ
, 6000*HZ
, 2047, 5};
136 * Rewrote the wait-routines to use the "new" wait-queue functionality,
137 * and getting rid of the cli-sti pairs. The wait-queue routines still
138 * need cli-sti, but now it's just a couple of 386 instructions or so.
140 * Note that the real wait_on_buffer() is an inline function that checks
141 * if 'b_wait' is set before calling this, so that the queues aren't set
/*
 * Sleep until the buffer's in-flight I/O completes (buffer unlocked).
 * The buffer is pinned via b_count around the wait so it cannot be
 * recycled while we sleep.
 * NOTE(review): this extraction is missing interior lines of the wait
 * loop (the "do {" opening and the break/schedule() pair) — verify
 * against a pristine copy of the source before relying on the shape.
 */
144 void __wait_on_buffer(struct buffer_head
* bh
)
146 struct task_struct
*tsk
= current
;
147 DECLARE_WAITQUEUE(wait
, tsk
);
/* Take a reference and register on the buffer's wait queue. */
149 atomic_inc(&bh
->b_count
);
150 add_wait_queue(&bh
->b_wait
, &wait
);
/* Kick the disk task queue so the I/O we wait on makes progress. */
152 run_task_queue(&tq_disk
);
153 set_task_state(tsk
, TASK_UNINTERRUPTIBLE
);
154 if (!buffer_locked(bh
))
157 } while (buffer_locked(bh
));
/* Done: restore runnable state, dequeue, and drop our reference. */
158 tsk
->state
= TASK_RUNNING
;
159 remove_wait_queue(&bh
->b_wait
, &wait
);
160 atomic_dec(&bh
->b_count
);
163 /* Call sync_buffers with wait!=0 to ensure that the call does not
164 * return until all buffer writes have completed. Sync() may return
165 * before the writes have finished; fsync() may not.
168 /* Godamity-damn. Some buffers (bitmaps for filesystems)
169 * spontaneously dirty themselves without ever brelse being called.
170 * We will ultimately want to put these in a separate list, but for
171 * now we search all of the lists for dirty buffers.
/*
 * Write dirty buffers for <dev> out to disk (dev == 0 appears to mean
 * "all devices", given the "if (dev && bh->b_dev != dev)" skip below).
 * With wait == 0 a single best-effort pass is made; with wait != 0 up
 * to three passes run (see the pass description comment below).
 * NOTE(review): extraction has dropped interior lines (loop openings,
 * continue/schedule statements, retry bookkeeping and the final
 * "return err") — confirm the control flow against pristine source.
 */
173 static int sync_buffers(kdev_t dev
, int wait
)
175 int i
, retry
, pass
= 0, err
= 0;
176 struct buffer_head
* bh
, *next
;
178 /* One pass for no-wait, three for wait:
179 * 0) write out all dirty, unlocked buffers;
180 * 1) write out all dirty buffers, waiting if locked;
181 * 2) wait for completion by waiting for all buffers to unlock.
186 /* We search all lists as a failsafe mechanism, not because we expect
187 * there to be dirty buffers on any of the other lists.
/* First scan: the BUF_DIRTY LRU list, under lru_list_lock. */
190 spin_lock(&lru_list_lock
);
191 bh
= lru_list
[BUF_DIRTY
];
/* The *2 bound tolerates the list changing while the lock is dropped. */
195 for (i
= nr_buffers_type
[BUF_DIRTY
]*2 ; i
-- > 0 ; bh
= next
) {
196 next
= bh
->b_next_free
;
198 if (!lru_list
[BUF_DIRTY
])
200 if (dev
&& bh
->b_dev
!= dev
)
202 if (buffer_locked(bh
)) {
203 /* Buffer is locked; skip it unless wait is
204 * requested AND pass > 0.
206 if (!wait
|| !pass
) {
/* Pin the buffer across the lock drop while we wait on it. */
210 atomic_inc(&bh
->b_count
);
211 spin_unlock(&lru_list_lock
);
213 atomic_dec(&bh
->b_count
);
217 /* If an unlocked buffer is not uptodate, there has
218 * been an IO error. Skip it.
220 if (wait
&& buffer_req(bh
) && !buffer_locked(bh
) &&
221 !buffer_dirty(bh
) && !buffer_uptodate(bh
)) {
226 /* Don't write clean buffers. Don't write ANY buffers
229 if (!buffer_dirty(bh
) || pass
>= 2)
/* Submit the write with the LRU lock dropped; bh stays pinned. */
232 atomic_inc(&bh
->b_count
);
233 spin_unlock(&lru_list_lock
);
234 ll_rw_block(WRITE
, 1, &bh
);
235 atomic_dec(&bh
->b_count
);
/* Second scan: the BUF_LOCKED list, waiting for in-flight I/O. */
241 bh
= lru_list
[BUF_LOCKED
];
243 spin_unlock(&lru_list_lock
);
246 for (i
= nr_buffers_type
[BUF_LOCKED
]*2 ; i
-- > 0 ; bh
= next
) {
247 next
= bh
->b_next_free
;
249 if (!lru_list
[BUF_LOCKED
])
251 if (dev
&& bh
->b_dev
!= dev
)
253 if (buffer_locked(bh
)) {
254 /* Buffer is locked; skip it unless wait is
255 * requested AND pass > 0.
257 if (!wait
|| !pass
) {
261 atomic_inc(&bh
->b_count
);
262 spin_unlock(&lru_list_lock
);
264 spin_lock(&lru_list_lock
);
265 atomic_dec(&bh
->b_count
);
269 spin_unlock(&lru_list_lock
);
271 /* If we are waiting for the sync to succeed, and if any dirty
272 * blocks were written, then repeat; on the second pass, only
273 * wait for buffers being written (do not pass to write any
274 * more buffers on the second pass).
276 } while (wait
&& retry
&& ++pass
<=2);
/*
 * Flush everything dirty for a device to the buffer cache and then to
 * disk.  The trailing sync_buffers(dev, 0) is deliberately placed last
 * (see the comment below); it does not wait for I/O completion.
 * NOTE(review): the higher-level sync calls that precede the visible
 * sync_buffers call are missing from this extraction.
 */
280 void sync_dev(kdev_t dev
)
285 /* sync all the dirty buffers out to disk only _after_ all the
286 high level layers have finished generating buffer dirty data
287 (or we'll return with some buffer still dirty on the blockdevice
288 so breaking the semantics of this call) */
289 sync_buffers(dev
, 0);
291 * FIXME(eric) we need to sync the physical devices here.
292 * This is because some (scsi) controllers have huge amounts of
293 * cache onboard (hundreds of Mb), and we need to instruct
294 * them to commit all of the dirty memory to disk, and we should
295 * not return until this has happened.
297 * This would need to get implemented by going through the assorted
298 * layers so that each block major number can be synced, and this
299 * would call down into the upper and mid-layer scsi.
/*
 * Like sync_dev(), but waits: the final sync_buffers(dev, 1) does not
 * return until the writes have completed, and its result is returned
 * to the caller.
 * NOTE(review): the intermediate sync calls between the two visible
 * sync_buffers calls are missing from this extraction.
 */
303 int fsync_dev(kdev_t dev
)
305 sync_buffers(dev
, 0);
313 return sync_buffers(dev
, 1);
316 asmlinkage
long sys_sync(void)
323 * filp may be NULL if called via the msync of a vma.
/*
 * Generic fsync implementation usable by filesystems: write the inode
 * to its buffers, write the superblock (via s_op->write_super when the
 * operation exists), then push the device's buffers to disk and wait
 * (sync_buffers(dev, 1)).  filp may be NULL when invoked via msync of
 * a vma (per the comment preceding this function).
 * NOTE(review): the lines assigning sb and dev, and the return, are
 * missing from this extraction.
 */
326 int file_fsync(struct file
*filp
, struct dentry
*dentry
, int datasync
)
328 struct inode
* inode
= dentry
->d_inode
;
329 struct super_block
* sb
;
334 /* sync the inode to buffers */
335 write_inode_now(inode
, 0);
337 /* sync the superblock to buffers */
340 if (sb
->s_op
&& sb
->s_op
->write_super
)
341 sb
->s_op
->write_super(sb
);
343 /* .. finally sync the buffers to disk */
345 ret
= sync_buffers(dev
, 1);
/*
 * fsync(2): flush data and metadata of the file behind fd.
 * Delegates to the file's f_op->fsync with the third argument 0,
 * i.e. a full sync (contrast sys_fdatasync below, which passes 1).
 * NOTE(review): the fget/fput bookkeeping, error-path returns, and the
 * locking around the ->fsync call are missing from this extraction.
 */
350 asmlinkage
long sys_fsync(unsigned int fd
)
353 struct dentry
* dentry
;
354 struct inode
* inode
;
362 dentry
= file
->f_dentry
;
363 inode
= dentry
->d_inode
;
/* Files whose f_op provides no fsync method cannot be synced. */
366 if (!file
->f_op
|| !file
->f_op
->fsync
)
369 /* We need to protect against concurrent writers.. */
371 err
= file
->f_op
->fsync(file
, dentry
, 0);
/*
 * fdatasync(2): like sys_fsync, but the datasync flag (third argument
 * to ->fsync) is 1, allowing the filesystem to skip non-essential
 * metadata updates.
 * NOTE(review): the fget/fput bookkeeping and error-path returns are
 * missing from this extraction.
 */
380 asmlinkage
long sys_fdatasync(unsigned int fd
)
383 struct dentry
* dentry
;
384 struct inode
* inode
;
392 dentry
= file
->f_dentry
;
393 inode
= dentry
->d_inode
;
396 if (!file
->f_op
|| !file
->f_op
->fsync
)
400 err
= file
->f_op
->fsync(file
, dentry
, 1);
409 /* After several hours of tedious analysis, the following hash
410 * function won. Do not mess with it... -DaveM
412 #define _hashfn(dev,block) \
413 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
414 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
415 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
/*
 * Link bh at the head of the hash chain rooted at *head.  Caller holds
 * hash_table_lock for writing (see insert_into_queues below).
 * NOTE(review): the tail of this helper (storing bh into *head and
 * setting bh->b_pprev) is missing from this extraction.
 */
417 static __inline__
void __hash_link(struct buffer_head
*bh
, struct buffer_head
**head
)
419 if ((bh
->b_next
= *head
) != NULL
)
420 bh
->b_next
->b_pprev
= &bh
->b_next
;
/*
 * Unlink bh from its hash chain by splicing b_pprev/b_next around it.
 * NOTE(review): the surrounding guard (presumably a check that bh is
 * actually hashed) and the pointer-clearing tail are missing from this
 * extraction — confirm against pristine source.
 */
425 static __inline__
void __hash_unlink(struct buffer_head
*bh
)
429 bh
->b_next
->b_pprev
= bh
->b_pprev
;
430 *(bh
->b_pprev
) = bh
->b_next
;
/*
 * Insert bh into the circular doubly-linked LRU list <blist> and bump
 * the per-list buffer count and byte total.  Caller holds
 * lru_list_lock (see insert_into_queues).
 * NOTE(review): the empty-list branch around the "bh->b_prev_free = bh"
 * self-link is incomplete in this extraction.
 */
435 static void __insert_into_lru_list(struct buffer_head
* bh
, int blist
)
437 struct buffer_head
**bhp
= &lru_list
[blist
];
441 bh
->b_prev_free
= bh
;
/* Splice bh in just before the current list head (i.e. at the tail). */
443 bh
->b_next_free
= *bhp
;
444 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
445 (*bhp
)->b_prev_free
->b_next_free
= bh
;
446 (*bhp
)->b_prev_free
= bh
;
/* Keep the per-list accounting in sync with the list contents. */
447 nr_buffers_type
[blist
]++;
448 size_buffers_type
[blist
] += bh
->b_size
;
/*
 * Remove bh from LRU list <blist>.  A buffer not on any list (both
 * b_prev_free and b_next_free NULL) is left alone.  Fixes up the list
 * head pointer — the double check handles the single-element case,
 * where after the splice the head still points at bh and the list must
 * become empty.  Updates the per-list count and byte accounting.
 */
451 static void __remove_from_lru_list(struct buffer_head
* bh
, int blist
)
453 if (bh
->b_prev_free
|| bh
->b_next_free
) {
454 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
455 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
456 if (lru_list
[blist
] == bh
)
457 lru_list
[blist
] = bh
->b_next_free
;
/* Single-element list: head still points at bh, so the list is empty. */
458 if (lru_list
[blist
] == bh
)
459 lru_list
[blist
] = NULL
;
460 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
461 nr_buffers_type
[blist
]--;
462 size_buffers_type
[blist
] -= bh
->b_size
;
/*
 * Remove bh from the per-size free list <index>.  A self-linked buffer
 * (b_next_free == bh) is the only element, so the list becomes empty;
 * otherwise bh is spliced out and the head pointer advanced past it if
 * necessary.  Caller holds the free list's lock (see getblk).
 */
466 static void __remove_from_free_list(struct buffer_head
* bh
, int index
)
468 if(bh
->b_next_free
== bh
)
469 free_list
[index
].list
= NULL
;
471 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
472 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
473 if (free_list
[index
].list
== bh
)
474 free_list
[index
].list
= bh
->b_next_free
;
/* Mark bh as being on no list. */
476 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
479 /* must be called with both the hash_table_lock and the lru_list_lock
/*
 * Detach bh from the lookup structures: the LRU list (visible below)
 * and, per the original, the hash chain.
 * NOTE(review): the __hash_unlink() call is missing from this
 * extraction — only the LRU removal is visible.
 */
481 static void __remove_from_queues(struct buffer_head
*bh
)
484 __remove_from_lru_list(bh
, bh
->b_list
);
/*
 * Make bh visible to lookups: link it into its hash chain and onto the
 * LRU list selected by bh->b_list.  Takes lru_list_lock before
 * hash_table_lock, matching the anti-deadlock ordering documented at
 * the top of this file.
 */
487 static void insert_into_queues(struct buffer_head
*bh
)
489 struct buffer_head
**head
= &hash(bh
->b_dev
, bh
->b_blocknr
);
491 spin_lock(&lru_list_lock
);
492 write_lock(&hash_table_lock
);
493 __hash_link(bh
, head
);
494 __insert_into_lru_list(bh
, bh
->b_list
);
495 write_unlock(&hash_table_lock
);
496 spin_unlock(&lru_list_lock
);
499 /* This function must only run if there are no other
500 * references _anywhere_ to this buffer head.
/*
 * Return bh to the free list for its size class, inserting it at the
 * tail (just before the current head), under that list's own lock.
 * NOTE(review): the empty-list branch and bh state resets between the
 * visible lines are missing from this extraction.
 */
502 static void put_last_free(struct buffer_head
* bh
)
504 struct bh_free_head
*head
= &free_list
[BUFSIZE_INDEX(bh
->b_size
)];
505 struct buffer_head
**bhp
= &head
->list
;
509 spin_lock(&head
->lock
);
513 bh
->b_prev_free
= bh
;
515 bh
->b_next_free
= *bhp
;
516 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
517 (*bhp
)->b_prev_free
->b_next_free
= bh
;
518 (*bhp
)->b_prev_free
= bh
;
519 spin_unlock(&head
->lock
);
523 * Why like this, I hear you say... The reason is race-conditions.
524 * As we don't lock buffers (unless we are reading them, that is),
525 * something might happen to it while we sleep (ie a read-error
526 * will force it bad). This shouldn't really happen currently, but
/*
 * Look up the buffer for (dev, block, size) in the hash table under
 * the read lock, taking a reference (b_count) on a match before the
 * lock is dropped so the buffer cannot be recycled underneath the
 * caller.  Returns the matched buffer head.
 * NOTE(review): the device-comparison clause of the match condition
 * and the NULL-return path are missing from this extraction.
 */
529 struct buffer_head
* get_hash_table(kdev_t dev
, int block
, int size
)
531 struct buffer_head
**head
= &hash(dev
, block
);
532 struct buffer_head
*bh
;
534 read_lock(&hash_table_lock
);
535 for(bh
= *head
; bh
; bh
= bh
->b_next
)
536 if (bh
->b_blocknr
== block
&&
537 bh
->b_size
== size
&&
541 atomic_inc(&bh
->b_count
);
542 read_unlock(&hash_table_lock
);
547 unsigned int get_hardblocksize(kdev_t dev
)
550 * Get the hard sector size for the given device. If we don't know
551 * what it is, return 0.
553 if (hardsect_size
[MAJOR(dev
)] != NULL
) {
554 int blksize
= hardsect_size
[MAJOR(dev
)][MINOR(dev
)];
560 * We don't know what the hardware sector size for this device is.
561 * Return 0 indicating that we don't know.
566 /* If invalidate_buffers() will trash dirty buffers, it means some kind
567 of fs corruption is going on. Trashing dirty data always imply losing
568 information that was supposed to be just stored on the physical layer
571 Thus invalidate_buffers in general usage is not allowed to trash dirty
572 buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
574 NOTE: In the case where the user removed a removable-media-disk even if
575 there's still dirty data not synced on disk (due to a bug in the device driver
576 or due to an error of the user), by not destroying the dirty buffers we could
577 generate corruption also on the next media inserted, thus a parameter is
578 necessary to handle this case in the most safe way possible (trying
579 to not corrupt also the new disk inserted with the data belonging to
580 the old now corrupted disk). Also for the ramdisk the natural thing
581 to do in order to release the ramdisk memory is to destroy dirty buffers.
583 These are two special cases. Normal usage implies that the device driver
584 issues a sync on the device (without waiting for I/O completion) and
585 then an invalidate_buffers call that doesn't trash dirty buffers. */
586 void __invalidate_buffers(kdev_t dev
, int destroy_dirty_buffers
)
589 struct buffer_head
* bh
, * bh_next
;
593 spin_lock(&lru_list_lock
);
594 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
595 bh
= lru_list
[nlist
];
598 for (i
= nr_buffers_type
[nlist
]; i
> 0 ; bh
= bh_next
, i
--) {
599 bh_next
= bh
->b_next_free
;
600 if (bh
->b_dev
!= dev
)
602 if (buffer_locked(bh
)) {
603 atomic_inc(&bh
->b_count
);
604 spin_unlock(&lru_list_lock
);
607 spin_lock(&lru_list_lock
);
608 atomic_dec(&bh
->b_count
);
611 write_lock(&hash_table_lock
);
612 if (!atomic_read(&bh
->b_count
) &&
613 (destroy_dirty_buffers
|| !buffer_dirty(bh
))) {
614 __remove_from_queues(bh
);
617 write_unlock(&hash_table_lock
);
623 spin_unlock(&lru_list_lock
);
628 void set_blocksize(kdev_t dev
, int size
)
630 extern int *blksize_size
[];
632 struct buffer_head
* bh
, * bh_next
;
634 if (!blksize_size
[MAJOR(dev
)])
637 /* Size must be a power of two, and between 512 and PAGE_SIZE */
638 if (size
> PAGE_SIZE
|| size
< 512 || (size
& (size
-1)))
639 panic("Invalid blocksize passed to set_blocksize");
641 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == 0 && size
== BLOCK_SIZE
) {
642 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
645 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == size
)
647 sync_buffers(dev
, 2);
648 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
652 spin_lock(&lru_list_lock
);
653 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
654 bh
= lru_list
[nlist
];
657 for (i
= nr_buffers_type
[nlist
]; i
> 0 ; bh
= bh_next
, i
--) {
658 bh_next
= bh
->b_next_free
;
659 if (bh
->b_dev
!= dev
|| bh
->b_size
== size
)
661 if (buffer_locked(bh
)) {
662 atomic_inc(&bh
->b_count
);
663 spin_unlock(&lru_list_lock
);
666 spin_lock(&lru_list_lock
);
667 atomic_dec(&bh
->b_count
);
670 write_lock(&hash_table_lock
);
671 if (!atomic_read(&bh
->b_count
)) {
672 if (buffer_dirty(bh
))
674 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
675 kdevname(dev
), bh
->b_blocknr
, bh
->b_size
);
676 __remove_from_queues(bh
);
679 if (atomic_set_buffer_clean(bh
))
681 clear_bit(BH_Uptodate
, &bh
->b_state
);
684 "b_count %d, dev %s, block %lu, from %p\n",
685 atomic_read(&bh
->b_count
), bdevname(bh
->b_dev
),
686 bh
->b_blocknr
, __builtin_return_address(0));
688 write_unlock(&hash_table_lock
);
694 spin_unlock(&lru_list_lock
);
700 * We used to try various strange things. Let's not.
702 static void refill_freelist(int size
)
704 if (!grow_buffers(size
)) {
706 current
->policy
|= SCHED_YIELD
;
711 void init_buffer(struct buffer_head
*bh
, bh_end_io_t
*handler
, void *private)
713 bh
->b_list
= BUF_CLEAN
;
714 bh
->b_end_io
= handler
;
715 bh
->b_private
= private;
718 static void end_buffer_io_sync(struct buffer_head
*bh
, int uptodate
)
720 mark_buffer_uptodate(bh
, uptodate
);
724 static void end_buffer_io_bad(struct buffer_head
*bh
, int uptodate
)
726 mark_buffer_uptodate(bh
, uptodate
);
731 static void end_buffer_io_async(struct buffer_head
* bh
, int uptodate
)
733 static spinlock_t page_uptodate_lock
= SPIN_LOCK_UNLOCKED
;
735 struct buffer_head
*tmp
;
738 mark_buffer_uptodate(bh
, uptodate
);
740 /* This is a temporary buffer used for page I/O. */
747 * Be _very_ careful from here on. Bad things can happen if
748 * two buffer heads end IO at almost the same time and both
749 * decide that the page is now completely done.
751 * Async buffer_heads are here only as labels for IO, and get
752 * thrown away once the IO for this page is complete. IO is
753 * deemed complete once all buffers have been visited
754 * (b_count==0) and are now unlocked. We must make sure that
755 * only the _last_ buffer that decrements its count is the one
756 * that unlock the page..
758 spin_lock_irqsave(&page_uptodate_lock
, flags
);
760 atomic_dec(&bh
->b_count
);
761 tmp
= bh
->b_this_page
;
763 if (tmp
->b_end_io
== end_buffer_io_async
&& buffer_locked(tmp
))
765 tmp
= tmp
->b_this_page
;
768 /* OK, the async IO on this page is complete. */
769 spin_unlock_irqrestore(&page_uptodate_lock
, flags
);
772 * if none of the buffers had errors then we can set the
775 if (!PageError(page
))
776 SetPageUptodate(page
);
779 * Run the hooks that have to be done when a page I/O has completed.
781 if (PageTestandClearDecrAfter(page
))
782 atomic_dec(&nr_async_pages
);
789 spin_unlock_irqrestore(&page_uptodate_lock
, flags
);
794 * Ok, this is getblk, and it isn't very clear, again to hinder
795 * race-conditions. Most of the code is seldom used, (ie repeating),
796 * so it should be much more efficient than it looks.
798 * The algorithm is changed: hopefully better, and an elusive bug removed.
800 * 14.02.92: changed it to sync dirty buffers a bit: better performance
801 * when the filesystem starts to get full of dirty blocks (I hope).
803 struct buffer_head
* getblk(kdev_t dev
, int block
, int size
)
805 struct buffer_head
* bh
;
809 bh
= get_hash_table(dev
, block
, size
);
813 isize
= BUFSIZE_INDEX(size
);
814 spin_lock(&free_list
[isize
].lock
);
815 bh
= free_list
[isize
].list
;
817 __remove_from_free_list(bh
, isize
);
818 atomic_set(&bh
->b_count
, 1);
820 spin_unlock(&free_list
[isize
].lock
);
823 * OK, FINALLY we know that this buffer is the only one of
824 * its kind, we hold a reference (b_count>0), it is unlocked,
828 init_buffer(bh
, end_buffer_io_sync
, NULL
);
830 bh
->b_blocknr
= block
;
831 bh
->b_state
= 1 << BH_Mapped
;
833 /* Insert the buffer into the regular lists */
834 insert_into_queues(bh
);
841 * If we block while refilling the free list, somebody may
842 * create the buffer first ... search the hashes again.
844 refill_freelist(size
);
848 /* -1 -> no need to flush
850 1 -> sync flush (wait for I/O completion) */
851 static int balance_dirty_state(kdev_t dev
)
853 unsigned long dirty
, tot
, hard_dirty_limit
, soft_dirty_limit
;
855 dirty
= size_buffers_type
[BUF_DIRTY
] >> PAGE_SHIFT
;
856 tot
= nr_free_buffer_pages();
857 tot
-= size_buffers_type
[BUF_PROTECTED
] >> PAGE_SHIFT
;
860 soft_dirty_limit
= tot
* bdf_prm
.b_un
.nfract
;
861 hard_dirty_limit
= soft_dirty_limit
* 2;
863 if (dirty
> soft_dirty_limit
) {
864 if (dirty
> hard_dirty_limit
)
872 * if a new dirty buffer is created we need to balance bdflush.
874 * in the future we might want to make bdflush aware of different
875 * pressures on different devices - thus the (currently unused)
878 void balance_dirty(kdev_t dev
)
880 int state
= balance_dirty_state(dev
);
884 wakeup_bdflush(state
);
887 static __inline__
void __mark_dirty(struct buffer_head
*bh
, int flag
)
889 bh
->b_flushtime
= jiffies
+ (flag
? bdf_prm
.b_un
.age_super
: bdf_prm
.b_un
.age_buffer
);
893 /* atomic version, the user must call balance_dirty() by hand
894 as soon as it become possible to block */
895 void __mark_buffer_dirty(struct buffer_head
*bh
, int flag
)
897 if (!atomic_set_buffer_dirty(bh
))
898 __mark_dirty(bh
, flag
);
901 void mark_buffer_dirty(struct buffer_head
*bh
, int flag
)
903 __mark_buffer_dirty(bh
, flag
);
904 balance_dirty(bh
->b_dev
);
908 * A buffer may need to be moved from one buffer list to another
909 * (e.g. in case it is not shared any more). Handle this.
/*
 * Recompute which LRU list bh belongs on from its state flags and move
 * it there when it differs from its current b_list.  Precedence is the
 * textual order below: locked, then dirty, then protected (later
 * checks override earlier ones).  Caller holds lru_list_lock (see
 * refile_buffer).
 * NOTE(review): the dirty branch's assignment (presumably
 * dispose = BUF_DIRTY) is missing from this extraction.
 */
911 static void __refile_buffer(struct buffer_head
*bh
)
913 int dispose
= BUF_CLEAN
;
914 if (buffer_locked(bh
))
915 dispose
= BUF_LOCKED
;
916 if (buffer_dirty(bh
))
918 if (buffer_protected(bh
))
919 dispose
= BUF_PROTECTED
;
920 if (dispose
!= bh
->b_list
) {
921 __remove_from_lru_list(bh
, bh
->b_list
);
922 bh
->b_list
= dispose
;
923 __insert_into_lru_list(bh
, dispose
);
/*
 * Locked wrapper around __refile_buffer().
 * NOTE(review): the __refile_buffer(bh) call between the lock/unlock
 * pair is missing from this extraction.
 */
927 void refile_buffer(struct buffer_head
*bh
)
929 spin_lock(&lru_list_lock
);
931 spin_unlock(&lru_list_lock
);
935 * Release a buffer head
/*
 * Drop one reference to a buffer head.  Warns (rather than crashing)
 * when the count is already zero, which would indicate a double
 * release by the caller.
 */
937 void __brelse(struct buffer_head
* buf
)
939 if (atomic_read(&buf
->b_count
)) {
940 atomic_dec(&buf
->b_count
);
943 printk("VFS: brelse: Trying to free free buffer\n");
947 * bforget() is like brelse(), except it puts the buffer on the
948 * free list if it can.. We can NOT free the buffer if:
949 * - there are other users of it
950 * - it is locked and thus can have active IO
/*
 * Forget a buffer: drop the reference and, when that was the last
 * reference and the buffer is not locked for I/O, remove it from the
 * hash chain and LRU list entirely.  Otherwise (still referenced or
 * locked) only the unlock path at the bottom runs.  Lock order is
 * lru_list_lock then hash_table_lock, per the ordering comment at the
 * top of this file; the lru lock also blocks bdflush (per the comment
 * below).
 * NOTE(review): the branch structure and the __hash_unlink /
 * put_last_free calls between the visible lines are missing from this
 * extraction.
 */
952 void __bforget(struct buffer_head
* buf
)
954 /* grab the lru lock here to block bdflush. */
955 spin_lock(&lru_list_lock
);
956 write_lock(&hash_table_lock
);
957 if (!atomic_dec_and_test(&buf
->b_count
) || buffer_locked(buf
))
960 write_unlock(&hash_table_lock
);
961 __remove_from_lru_list(buf
, buf
->b_list
);
962 spin_unlock(&lru_list_lock
);
967 write_unlock(&hash_table_lock
);
968 spin_unlock(&lru_list_lock
);
972 * bread() reads a specified block and returns the buffer that contains
973 * it. It returns NULL if the block was unreadable.
/*
 * Read block <block> of <dev> at blocksize <size> and return the
 * buffer containing it (NULL on read failure, per the comment above).
 * Fast path: getblk() finds the block already uptodate in the cache.
 * Slow path: submit a READ via ll_rw_block and re-check uptodate.
 * NOTE(review): the return statements, the wait_on_buffer between
 * submit and re-check, and the failure path (brelse + NULL) are
 * missing from this extraction.
 */
975 struct buffer_head
* bread(kdev_t dev
, int block
, int size
)
977 struct buffer_head
* bh
;
979 bh
= getblk(dev
, block
, size
);
980 if (buffer_uptodate(bh
))
982 ll_rw_block(READ
, 1, &bh
);
984 if (buffer_uptodate(bh
))
991 * Ok, breada can be used as bread, but additionally to mark other
992 * blocks for reading as well. End the argument list with a negative
998 struct buffer_head
* breada(kdev_t dev
, int block
, int bufsize
,
999 unsigned int pos
, unsigned int filesize
)
1001 struct buffer_head
* bhlist
[NBUF
];
1002 unsigned int blocks
;
1003 struct buffer_head
* bh
;
1007 if (pos
>= filesize
)
1013 bh
= getblk(dev
, block
, bufsize
);
1014 index
= BUFSIZE_INDEX(bh
->b_size
);
1016 if (buffer_uptodate(bh
))
1018 else ll_rw_block(READ
, 1, &bh
);
1020 blocks
= (filesize
- pos
) >> (9+index
);
1022 if (blocks
< (read_ahead
[MAJOR(dev
)] >> index
))
1023 blocks
= read_ahead
[MAJOR(dev
)] >> index
;
1027 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1031 for(i
=1; i
<blocks
; i
++) {
1032 bh
= getblk(dev
,block
+i
,bufsize
);
1033 if (buffer_uptodate(bh
)) {
1037 else bhlist
[j
++] = bh
;
1040 /* Request the read for these buffers, and then release them. */
1042 ll_rw_block(READA
, (j
-1), bhlist
+1);
1046 /* Wait for this buffer, and then continue on. */
1049 if (buffer_uptodate(bh
))
1056 * Note: the caller should wake up the buffer_wait list if needed.
1058 static __inline__
void __put_unused_buffer_head(struct buffer_head
* bh
)
1060 if (nr_unused_buffer_heads
>= MAX_UNUSED_BUFFERS
) {
1061 kmem_cache_free(bh_cachep
, bh
);
1064 init_waitqueue_head(&bh
->b_wait
);
1065 nr_unused_buffer_heads
++;
1066 bh
->b_next_free
= unused_list
;
1067 bh
->b_this_page
= NULL
;
1073 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1074 * no-buffer-head deadlock. Return NULL on failure; waiting for
1075 * buffer heads is now handled in create_buffers().
1077 static struct buffer_head
* get_unused_buffer_head(int async
)
1079 struct buffer_head
* bh
;
1081 spin_lock(&unused_list_lock
);
1082 if (nr_unused_buffer_heads
> NR_RESERVED
) {
1084 unused_list
= bh
->b_next_free
;
1085 nr_unused_buffer_heads
--;
1086 spin_unlock(&unused_list_lock
);
1089 spin_unlock(&unused_list_lock
);
1091 /* This is critical. We can't swap out pages to get
1092 * more buffer heads, because the swap-out may need
1093 * more buffer-heads itself. Thus SLAB_BUFFER.
1095 if((bh
= kmem_cache_alloc(bh_cachep
, SLAB_BUFFER
)) != NULL
) {
1096 memset(bh
, 0, sizeof(*bh
));
1097 init_waitqueue_head(&bh
->b_wait
);
1102 * If we need an async buffer, use the reserved buffer heads.
1105 spin_lock(&unused_list_lock
);
1108 unused_list
= bh
->b_next_free
;
1109 nr_unused_buffer_heads
--;
1110 spin_unlock(&unused_list_lock
);
1113 spin_unlock(&unused_list_lock
);
1117 * (Pending further analysis ...)
1118 * Ordinary (non-async) requests can use a different memory priority
1119 * to free up pages. Any swapping thus generated will use async
1123 (bh
= kmem_cache_alloc(bh_cachep
, SLAB_KERNEL
)) != NULL
) {
1124 memset(bh
, 0, sizeof(*bh
));
1125 init_waitqueue_head(&bh
->b_wait
);
1133 void set_bh_page (struct buffer_head
*bh
, struct page
*page
, unsigned long offset
)
1136 if (offset
>= PAGE_SIZE
)
1138 if (PageHighMem(page
))
1140 * This catches illegal uses and preserves the offset:
1142 bh
->b_data
= (char *)(0 + offset
);
1144 bh
->b_data
= (char *)(page_address(page
) + offset
);
1148 * Create the appropriate buffers when given a page for data area and
1149 * the size of each buffer.. Use the bh->b_this_page linked list to
1150 * follow the buffers created. Return NULL if unable to create more
1152 * The async flag is used to differentiate async IO (paging, swapping)
1153 * from ordinary buffer allocations, and only async requests are allowed
1154 * to sleep waiting for buffer heads.
1156 static struct buffer_head
* create_buffers(struct page
* page
, unsigned long size
, int async
)
1158 struct buffer_head
*bh
, *head
;
1164 while ((offset
-= size
) >= 0) {
1165 bh
= get_unused_buffer_head(async
);
1169 bh
->b_dev
= B_FREE
; /* Flag as unused */
1170 bh
->b_this_page
= head
;
1174 bh
->b_next_free
= NULL
;
1176 atomic_set(&bh
->b_count
, 0);
1179 set_bh_page(bh
, page
, offset
);
1181 bh
->b_list
= BUF_CLEAN
;
1182 bh
->b_end_io
= end_buffer_io_bad
;
1186 * In case anything failed, we just free everything we got.
1190 spin_lock(&unused_list_lock
);
1193 head
= head
->b_this_page
;
1194 __put_unused_buffer_head(bh
);
1196 spin_unlock(&unused_list_lock
);
1198 /* Wake up any waiters ... */
1199 wake_up(&buffer_wait
);
1203 * Return failure for non-async IO requests. Async IO requests
1204 * are not allowed to fail, so we have to wait until buffer heads
1205 * become available. But we don't want tasks sleeping with
1206 * partially complete buffers, so all were released above.
1211 /* We're _really_ low on memory. Now we just
1212 * wait for old buffer heads to become free due to
1213 * finishing IO. Since this is an async request and
1214 * the reserve list is empty, we're sure there are
1215 * async buffer heads in use.
1217 run_task_queue(&tq_disk
);
1220 * Set our state for sleeping, then check again for buffer heads.
1221 * This ensures we won't miss a wake_up from an interrupt.
1223 wait_event(buffer_wait
, nr_unused_buffer_heads
>= MAX_BUF_PER_PAGE
);
1227 static int create_page_buffers(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
)
1229 struct buffer_head
*head
, *bh
, *tail
;
1232 if (!PageLocked(page
))
1235 * Allocate async buffer heads pointing to this page, just for I/O.
1236 * They don't show up in the buffer hash table, but they *are*
1237 * registered in page->buffers.
1239 head
= create_buffers(page
, size
, 1);
1245 for (bh
= head
; bh
; bh
= bh
->b_this_page
) {
1249 init_buffer(bh
, end_buffer_io_async
, NULL
);
1251 bh
->b_blocknr
= block
;
1253 set_bit(BH_Mapped
, &bh
->b_state
);
1255 tail
->b_this_page
= head
;
1256 page_cache_get(page
);
1257 page
->buffers
= head
;
1261 static void unmap_buffer(struct buffer_head
* bh
)
1263 if (buffer_mapped(bh
)) {
1264 mark_buffer_clean(bh
);
1266 clear_bit(BH_Uptodate
, &bh
->b_state
);
1267 clear_bit(BH_Mapped
, &bh
->b_state
);
1268 clear_bit(BH_Req
, &bh
->b_state
);
1269 clear_bit(BH_New
, &bh
->b_state
);
1274 * We don't have to release all buffers here, but
1275 * we have to be sure that no dirty buffer is left
1276 * and no IO is going on (no buffer is locked), because
1277 * we have truncated the file and are going to free the
1280 int block_flushpage(struct page
*page
, unsigned long offset
)
1282 struct buffer_head
*head
, *bh
, *next
;
1283 unsigned int curr_off
= 0;
1285 if (!PageLocked(page
))
1290 head
= page
->buffers
;
1293 unsigned int next_off
= curr_off
+ bh
->b_size
;
1294 next
= bh
->b_this_page
;
1297 * is this block fully flushed?
1299 if (offset
<= curr_off
)
1301 curr_off
= next_off
;
1303 } while (bh
!= head
);
1306 * subtle. We release buffer-heads only if this is
1307 * the 'final' flushpage. We have invalidated the get_block
1308 * cached value unconditionally, so real IO is not
1311 * If the free doesn't work out, the buffers can be
1312 * left around - they just turn into anonymous buffers
1316 if (!try_to_free_buffers(page
, 0)) {
1317 atomic_inc(&buffermem_pages
);
1325 static void create_empty_buffers(struct page
*page
, struct inode
*inode
, unsigned long blocksize
)
1327 struct buffer_head
*bh
, *head
, *tail
;
1329 head
= create_buffers(page
, blocksize
, 1);
1335 bh
->b_dev
= inode
->i_dev
;
1337 bh
->b_end_io
= end_buffer_io_bad
;
1339 bh
= bh
->b_this_page
;
1341 tail
->b_this_page
= head
;
1342 page
->buffers
= head
;
1343 page_cache_get(page
);
1346 static void unmap_underlying_metadata(struct buffer_head
* bh
)
1348 struct buffer_head
*old_bh
;
1350 old_bh
= get_hash_table(bh
->b_dev
, bh
->b_blocknr
, bh
->b_size
);
1352 unmap_buffer(old_bh
);
1353 /* Here we could run brelse or bforget. We use
1354 bforget because it will try to put the buffer
1361 * block_write_full_page() is SMP-safe - currently it's still
1362 * being called with the kernel lock held, but the code is ready.
1364 static int __block_write_full_page(struct inode
*inode
, struct page
*page
, get_block_t
*get_block
)
1366 int err
, i
, need_balance_dirty
= 0;
1367 unsigned long block
;
1368 struct buffer_head
*bh
, *head
;
1370 if (!PageLocked(page
))
1374 create_empty_buffers(page
, inode
, inode
->i_sb
->s_blocksize
);
1375 head
= page
->buffers
;
1377 block
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
);
1383 * If the buffer isn't up-to-date, we can't be sure
1384 * that the buffer has been initialized with the proper
1385 * block number information etc..
1387 * Leave it to the low-level FS to make all those
1388 * decisions (block #0 may actually be a valid block)
1390 bh
->b_end_io
= end_buffer_io_sync
;
1391 if (!buffer_mapped(bh
)) {
1392 err
= get_block(inode
, block
, bh
, 1);
1396 unmap_underlying_metadata(bh
);
1398 set_bit(BH_Uptodate
, &bh
->b_state
);
1399 if (!atomic_set_buffer_dirty(bh
)) {
1400 __mark_dirty(bh
, 0);
1401 need_balance_dirty
= 1;
1404 bh
= bh
->b_this_page
;
1406 } while (bh
!= head
);
1408 if (need_balance_dirty
)
1409 balance_dirty(bh
->b_dev
);
1411 SetPageUptodate(page
);
1414 ClearPageUptodate(page
);
1418 static int __block_prepare_write(struct inode
*inode
, struct page
*page
,
1419 unsigned from
, unsigned to
, get_block_t
*get_block
)
1421 unsigned block_start
, block_end
;
1422 unsigned long block
;
1424 unsigned blocksize
, bbits
;
1425 struct buffer_head
*bh
, *head
, *wait
[2], **wait_bh
=wait
;
1426 char *kaddr
= (char *)kmap(page
);
1428 blocksize
= inode
->i_sb
->s_blocksize
;
1430 create_empty_buffers(page
, inode
, blocksize
);
1431 head
= page
->buffers
;
1433 bbits
= inode
->i_sb
->s_blocksize_bits
;
1434 block
= page
->index
<< (PAGE_CACHE_SHIFT
- bbits
);
1436 for(bh
= head
, block_start
= 0; bh
!= head
|| !block_start
;
1437 block
++, block_start
=block_end
, bh
= bh
->b_this_page
) {
1440 block_end
= block_start
+blocksize
;
1441 if (block_end
<= from
)
1443 if (block_start
>= to
)
1445 bh
->b_end_io
= end_buffer_io_sync
;
1446 if (!buffer_mapped(bh
)) {
1447 err
= get_block(inode
, block
, bh
, 1);
1450 if (buffer_new(bh
)) {
1451 unmap_underlying_metadata(bh
);
1453 memset(kaddr
+to
, 0, block_end
-to
);
1454 if (block_start
< from
)
1455 memset(kaddr
+block_start
, 0, from
-block_start
);
1459 if (!buffer_uptodate(bh
) &&
1460 (block_start
< from
|| block_end
> to
)) {
1461 ll_rw_block(READ
, 1, &bh
);
1466 * If we issued read requests - let them complete.
1468 while(wait_bh
> wait
) {
1469 wait_on_buffer(*--wait_bh
);
1471 if (!buffer_uptodate(*wait_bh
))
1479 static int __block_commit_write(struct inode
*inode
, struct page
*page
,
1480 unsigned from
, unsigned to
)
1482 unsigned block_start
, block_end
;
1483 int partial
= 0, need_balance_dirty
= 0;
1485 struct buffer_head
*bh
, *head
;
1487 blocksize
= inode
->i_sb
->s_blocksize
;
1489 for(bh
= head
= page
->buffers
, block_start
= 0;
1490 bh
!= head
|| !block_start
;
1491 block_start
=block_end
, bh
= bh
->b_this_page
) {
1492 block_end
= block_start
+ blocksize
;
1493 if (block_end
<= from
|| block_start
>= to
) {
1494 if (!buffer_uptodate(bh
))
1497 set_bit(BH_Uptodate
, &bh
->b_state
);
1498 if (!atomic_set_buffer_dirty(bh
)) {
1499 __mark_dirty(bh
, 0);
1500 need_balance_dirty
= 1;
1505 if (need_balance_dirty
)
1506 balance_dirty(bh
->b_dev
);
1508 * is this a partial write that happened to make all buffers
1509 * uptodate then we can optimize away a bogus readpage() for
1510 * the next read(). Here we 'discover' wether the page went
1511 * uptodate as a result of this (potentially partial) write.
1514 SetPageUptodate(page
);
1519 * Generic "read page" function for block devices that have the normal
1520 * get_block functionality. This is most of the block device filesystems.
1521 * Reads the page asynchronously --- the unlock_buffer() and
1522 * mark_buffer_uptodate() functions propagate buffer state into the
1523 * page struct once IO has completed.
1525 int block_read_full_page(struct page
*page
, get_block_t
*get_block
)
1527 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1528 unsigned long iblock
, lblock
;
1529 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
];
1530 unsigned int blocksize
, blocks
;
1531 unsigned long kaddr
= 0;
1534 if (!PageLocked(page
))
1536 blocksize
= inode
->i_sb
->s_blocksize
;
1538 create_empty_buffers(page
, inode
, blocksize
);
1539 head
= page
->buffers
;
1541 blocks
= PAGE_CACHE_SIZE
>> inode
->i_sb
->s_blocksize_bits
;
1542 iblock
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
);
1543 lblock
= (inode
->i_size
+blocksize
-1) >> inode
->i_sb
->s_blocksize_bits
;
1549 if (buffer_uptodate(bh
))
1552 if (!buffer_mapped(bh
)) {
1553 if (iblock
< lblock
)
1554 get_block(inode
, iblock
, bh
, 0);
1555 if (!buffer_mapped(bh
)) {
1558 memset((char *)(kaddr
+ i
*blocksize
), 0, blocksize
);
1559 set_bit(BH_Uptodate
, &bh
->b_state
);
1564 init_buffer(bh
, end_buffer_io_async
, NULL
);
1565 atomic_inc(&bh
->b_count
);
1568 } while (i
++, iblock
++, (bh
= bh
->b_this_page
) != head
);
1571 if (Page_Uptodate(page
))
1573 ll_rw_block(READ
, nr
, arr
);
1576 * all buffers are uptodate - we can set the page
1579 SetPageUptodate(page
);
1588 * For moronic filesystems that do not allow holes in file.
1589 * We may have to extend the file.
1592 int cont_prepare_write(struct page
*page
, unsigned offset
, unsigned to
, get_block_t
*get_block
, unsigned long *bytes
)
1594 struct address_space
*mapping
= page
->mapping
;
1595 struct inode
*inode
= (struct inode
*)mapping
->host
;
1596 struct page
*new_page
;
1597 unsigned long pgpos
;
1600 unsigned blocksize
= inode
->i_sb
->s_blocksize
;
1603 while(page
->index
> (pgpos
= *bytes
>>PAGE_CACHE_SHIFT
)) {
1605 new_page
= grab_cache_page(mapping
, pgpos
);
1608 /* we might sleep */
1609 if (*bytes
>>PAGE_CACHE_SHIFT
!= pgpos
) {
1610 UnlockPage(new_page
);
1611 page_cache_release(new_page
);
1614 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
;
1615 if (zerofrom
& (blocksize
-1)) {
1616 *bytes
|= (blocksize
-1);
1619 status
= __block_prepare_write(inode
, new_page
, zerofrom
,
1620 PAGE_CACHE_SIZE
, get_block
);
1623 kaddr
= (char*)page_address(new_page
);
1624 memset(kaddr
+zerofrom
, 0, PAGE_CACHE_SIZE
-zerofrom
);
1625 __block_commit_write(inode
, new_page
, zerofrom
, PAGE_CACHE_SIZE
);
1627 UnlockPage(new_page
);
1628 page_cache_release(new_page
);
1631 if (page
->index
< pgpos
) {
1632 /* completely inside the area */
1635 /* page covers the boundary, find the boundary offset */
1636 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
;
1638 /* if we will expand the thing last block will be filled */
1639 if (to
> zerofrom
&& (zerofrom
& (blocksize
-1))) {
1640 *bytes
|= (blocksize
-1);
1644 /* starting below the boundary? Nothing to zero out */
1645 if (offset
<= zerofrom
)
1648 status
= __block_prepare_write(inode
, page
, zerofrom
, to
, get_block
);
1651 kaddr
= (char*)page_address(page
);
1652 if (zerofrom
< offset
) {
1653 memset(kaddr
+zerofrom
, 0, offset
-zerofrom
);
1654 __block_commit_write(inode
, page
, zerofrom
, offset
);
1658 ClearPageUptodate(page
);
1663 ClearPageUptodate(new_page
);
1665 UnlockPage(new_page
);
1666 page_cache_release(new_page
);
1671 int block_prepare_write(struct page
*page
, unsigned from
, unsigned to
,
1672 get_block_t
*get_block
)
1674 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1675 int err
= __block_prepare_write(inode
, page
, from
, to
, get_block
);
1677 ClearPageUptodate(page
);
1683 int generic_commit_write(struct file
*file
, struct page
*page
,
1684 unsigned from
, unsigned to
)
1686 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1687 loff_t pos
= ((loff_t
)page
->index
<< PAGE_CACHE_SHIFT
) + to
;
1688 __block_commit_write(inode
,page
,from
,to
);
1690 if (pos
> inode
->i_size
)
1691 inode
->i_size
= pos
;
1695 int block_write_full_page(struct page
*page
, get_block_t
*get_block
)
1697 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1698 unsigned long end_index
= inode
->i_size
>> PAGE_CACHE_SHIFT
;
1703 if (page
->index
< end_index
)
1704 return __block_write_full_page(inode
, page
, get_block
);
1706 /* things got complicated... */
1707 offset
= inode
->i_size
& (PAGE_CACHE_SIZE
-1);
1708 /* OK, are we completely out? */
1709 if (page
->index
>= end_index
+1 || !offset
)
1711 /* Sigh... will have to work, then... */
1712 err
= __block_prepare_write(inode
, page
, 0, offset
, get_block
);
1714 memset((char *)page_address(page
)+offset
, 0, PAGE_CACHE_SIZE
-offset
);
1715 __block_commit_write(inode
,page
,0,offset
);
1720 ClearPageUptodate(page
);
1724 int generic_block_bmap(struct address_space
*mapping
, long block
, get_block_t
*get_block
)
1726 struct buffer_head tmp
;
1727 struct inode
*inode
= (struct inode
*)mapping
->host
;
1730 get_block(inode
, block
, &tmp
, 0);
1731 return tmp
.b_blocknr
;
1735 * IO completion routine for a buffer_head being used for kiobuf IO: we
1736 * can't dispatch the kiobuf callback until io_count reaches 0.
1739 static void end_buffer_io_kiobuf(struct buffer_head
*bh
, int uptodate
)
1741 struct kiobuf
*kiobuf
;
1743 mark_buffer_uptodate(bh
, uptodate
);
1745 kiobuf
= bh
->b_private
;
1747 end_kio_request(kiobuf
, uptodate
);
1752 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1753 * for them to complete. Clean up the buffer_heads afterwards.
1756 static int do_kio(int rw
, int nr
, struct buffer_head
*bh
[], int size
)
1760 struct buffer_head
*tmp
;
1764 ll_rw_block(rw
, nr
, bh
);
1767 spin_lock(&unused_list_lock
);
1769 for (i
= nr
; --i
>= 0; ) {
1772 if (buffer_locked(tmp
)) {
1773 spin_unlock(&unused_list_lock
);
1774 wait_on_buffer(tmp
);
1775 spin_lock(&unused_list_lock
);
1778 if (!buffer_uptodate(tmp
)) {
1779 /* We are traversing bh'es in reverse order so
1780 clearing iosize on error calculates the
1781 amount of IO before the first error. */
1784 __put_unused_buffer_head(tmp
);
1787 spin_unlock(&unused_list_lock
);
1793 * Start I/O on a physical range of kernel memory, defined by a vector
1794 * of kiobuf structs (much like a user-space iovec list).
1796 * The kiobuf must already be locked for IO. IO is submitted
1797 * asynchronously: you need to check page->locked, page->uptodate, and
1798 * maybe wait on page->wait.
1800 * It is up to the caller to make sure that there are enough blocks
1801 * passed in to completely map the iobufs to disk.
1804 int brw_kiovec(int rw
, int nr
, struct kiobuf
*iovec
[],
1805 kdev_t dev
, unsigned long b
[], int size
)
1815 unsigned long blocknr
;
1816 struct kiobuf
* iobuf
= NULL
;
1818 struct buffer_head
*tmp
, *bh
[KIO_MAX_SECTORS
];
1824 * First, do some alignment and validity checks
1826 for (i
= 0; i
< nr
; i
++) {
1828 if ((iobuf
->offset
& (size
-1)) ||
1829 (iobuf
->length
& (size
-1)))
1831 if (!iobuf
->nr_pages
)
1832 panic("brw_kiovec: iobuf not initialised");
1836 * OK to walk down the iovec doing page IO on each page we find.
1838 bufind
= bhind
= transferred
= err
= 0;
1839 for (i
= 0; i
< nr
; i
++) {
1841 offset
= iobuf
->offset
;
1842 length
= iobuf
->length
;
1845 for (pageind
= 0; pageind
< iobuf
->nr_pages
; pageind
++) {
1846 map
= iobuf
->maplist
[pageind
];
1852 while (length
> 0) {
1853 blocknr
= b
[bufind
++];
1854 tmp
= get_unused_buffer_head(0);
1860 tmp
->b_dev
= B_FREE
;
1862 set_bh_page(tmp
, map
, offset
);
1863 tmp
->b_this_page
= tmp
;
1865 init_buffer(tmp
, end_buffer_io_kiobuf
, iobuf
);
1867 tmp
->b_blocknr
= blocknr
;
1868 tmp
->b_state
= 1 << BH_Mapped
;
1871 set_bit(BH_Uptodate
, &tmp
->b_state
);
1872 set_bit(BH_Dirty
, &tmp
->b_state
);
1879 atomic_inc(&iobuf
->io_count
);
1882 * Start the IO if we have got too much
1884 if (bhind
>= KIO_MAX_SECTORS
) {
1885 err
= do_kio(rw
, bhind
, bh
, size
);
1893 if (offset
>= PAGE_SIZE
) {
1897 } /* End of block loop */
1898 } /* End of page loop */
1899 } /* End of iovec loop */
1901 /* Is there any IO still left to submit? */
1903 err
= do_kio(rw
, bhind
, bh
, size
);
1916 /* We got an error allocating the bh'es. Just free the current
1917 buffer_heads and exit. */
1918 spin_lock(&unused_list_lock
);
1919 for (i
= bhind
; --i
>= 0; ) {
1920 __put_unused_buffer_head(bh
[bhind
]);
1922 spin_unlock(&unused_list_lock
);
1927 * Start I/O on a page.
1928 * This function expects the page to be locked and may return
1929 * before I/O is complete. You then have to check page->locked,
1930 * page->uptodate, and maybe wait on page->wait.
1932 * brw_page() is SMP-safe, although it's being called with the
1933 * kernel lock held - but the code is ready.
1935 * FIXME: we need a swapper_inode->get_block function to remove
1936 * some of the bmap kludges and interface ugliness here.
1938 int brw_page(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
)
1940 struct buffer_head
*head
, *bh
, *arr
[MAX_BUF_PER_PAGE
];
1941 int nr
, fresh
/* temporary debugging flag */, block
;
1943 if (!PageLocked(page
))
1944 panic("brw_page: page not locked for I/O");
1945 // ClearPageError(page);
1947 * We pretty much rely on the page lock for this, because
1948 * create_page_buffers() might sleep.
1951 if (!page
->buffers
) {
1952 create_page_buffers(rw
, page
, dev
, b
, size
);
1958 head
= page
->buffers
;
1964 if (fresh
&& (atomic_read(&bh
->b_count
) != 0))
1969 if (!buffer_uptodate(bh
)) {
1971 atomic_inc(&bh
->b_count
);
1973 } else { /* WRITE */
1974 if (!bh
->b_blocknr
) {
1977 bh
->b_blocknr
= block
;
1982 set_bit(BH_Uptodate
, &bh
->b_state
);
1983 set_bit(BH_Dirty
, &bh
->b_state
);
1985 atomic_inc(&bh
->b_count
);
1987 bh
= bh
->b_this_page
;
1988 } while (bh
!= head
);
1989 if ((rw
== READ
) && nr
) {
1990 if (Page_Uptodate(page
))
1992 ll_rw_block(rw
, nr
, arr
);
1994 if (!nr
&& rw
== READ
) {
1995 SetPageUptodate(page
);
1998 if (nr
&& (rw
== WRITE
))
1999 ll_rw_block(rw
, nr
, arr
);
2004 int block_symlink(struct inode
*inode
, const char *symname
, int len
)
2006 struct address_space
*mapping
= inode
->i_mapping
;
2007 struct page
*page
= grab_cache_page(mapping
, 0);
2013 err
= mapping
->a_ops
->prepare_write(NULL
, page
, 0, len
-1);
2016 kaddr
= (char*)page_address(page
);
2017 memcpy(kaddr
, symname
, len
-1);
2018 mapping
->a_ops
->commit_write(NULL
, page
, 0, len
-1);
2020 * Notice that we are _not_ going to block here - end of page is
2021 * unmapped, so this will only try to map the rest of page, see
2022 * that it is unmapped (typically even will not look into inode -
2023 * ->i_size will be enough for everything) and zero it out.
2024 * OTOH it's obviously correct and should make the page up-to-date.
2026 err
= mapping
->a_ops
->readpage(NULL
, page
);
2028 page_cache_release(page
);
2031 mark_inode_dirty(inode
);
2035 page_cache_release(page
);
2041 * Try to increase the number of buffers available: the size argument
2042 * is used to determine what kind of buffers we want.
2044 static int grow_buffers(int size
)
2047 struct buffer_head
*bh
, *tmp
;
2048 struct buffer_head
* insert_point
;
2051 if ((size
& 511) || (size
> PAGE_SIZE
)) {
2052 printk("VFS: grow_buffers: size = %d\n",size
);
2056 page
= alloc_page(GFP_BUFFER
);
2059 bh
= create_buffers(page
, size
, 0);
2061 goto no_buffer_head
;
2063 isize
= BUFSIZE_INDEX(size
);
2065 spin_lock(&free_list
[isize
].lock
);
2066 insert_point
= free_list
[isize
].list
;
2070 tmp
->b_next_free
= insert_point
->b_next_free
;
2071 tmp
->b_prev_free
= insert_point
;
2072 insert_point
->b_next_free
->b_prev_free
= tmp
;
2073 insert_point
->b_next_free
= tmp
;
2075 tmp
->b_prev_free
= tmp
;
2076 tmp
->b_next_free
= tmp
;
2079 if (tmp
->b_this_page
)
2080 tmp
= tmp
->b_this_page
;
2084 tmp
->b_this_page
= bh
;
2085 free_list
[isize
].list
= bh
;
2086 spin_unlock(&free_list
[isize
].lock
);
2089 page
->flags
&= ~(1 << PG_referenced
);
2090 lru_cache_add(page
);
2091 atomic_inc(&buffermem_pages
);
2095 page_cache_release(page
);
2101 * Sync all the buffers on one page..
2103 * If we have old buffers that are locked, we'll
2104 * wait on them, but we won't wait on the new ones
2105 * we're writing out now.
2107 * This all is required so that we can free up memory
2110 static void sync_page_buffers(struct buffer_head
*bh
, int wait
)
2112 struct buffer_head
* tmp
= bh
;
2115 struct buffer_head
*p
= tmp
;
2116 tmp
= tmp
->b_this_page
;
2117 if (buffer_locked(p
)) {
2119 __wait_on_buffer(p
);
2120 } else if (buffer_dirty(p
))
2121 ll_rw_block(WRITE
, 1, &p
);
2122 } while (tmp
!= bh
);
2126 * Can the buffer be thrown out?
2128 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2129 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2132 * try_to_free_buffers() checks if all the buffers on this particular page
2133 * are unused, and free's the page if so.
2135 * Wake up bdflush() if this fails - if we're running low on memory due
2136 * to dirty buffers, we need to flush them out as quickly as possible.
2138 * NOTE: There are quite a number of ways that threads of control can
2139 * obtain a reference to a buffer head within a page. So we must
2140 * lock out all of these paths to cleanly toss the page.
2142 int try_to_free_buffers(struct page
* page
, int wait
)
2144 struct buffer_head
* tmp
, * bh
= page
->buffers
;
2145 int index
= BUFSIZE_INDEX(bh
->b_size
);
2147 spin_lock(&lru_list_lock
);
2148 write_lock(&hash_table_lock
);
2149 spin_lock(&free_list
[index
].lock
);
2152 struct buffer_head
*p
= tmp
;
2154 tmp
= tmp
->b_this_page
;
2156 goto busy_buffer_page
;
2157 } while (tmp
!= bh
);
2159 spin_lock(&unused_list_lock
);
2162 struct buffer_head
* p
= tmp
;
2163 tmp
= tmp
->b_this_page
;
2165 /* The buffer can be either on the regular
2166 * queues or on the free list..
2168 if (p
->b_dev
!= B_FREE
)
2169 __remove_from_queues(p
);
2171 __remove_from_free_list(p
, index
);
2172 __put_unused_buffer_head(p
);
2173 } while (tmp
!= bh
);
2174 spin_unlock(&unused_list_lock
);
2176 /* Wake up anyone waiting for buffer heads */
2177 wake_up(&buffer_wait
);
2179 /* And free the page */
2180 page
->buffers
= NULL
;
2181 page_cache_release(page
);
2182 spin_unlock(&free_list
[index
].lock
);
2183 write_unlock(&hash_table_lock
);
2184 spin_unlock(&lru_list_lock
);
2188 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2189 spin_unlock(&free_list
[index
].lock
);
2190 write_unlock(&hash_table_lock
);
2191 spin_unlock(&lru_list_lock
);
2192 sync_page_buffers(bh
, wait
);
2196 /* ================== Debugging =================== */
2198 void show_buffers(void)
2201 struct buffer_head
* bh
;
2202 int found
= 0, locked
= 0, dirty
= 0, used
= 0, lastused
= 0;
2205 static char *buf_types
[NR_LIST
] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2208 printk("Buffer memory: %6dkB\n",
2209 atomic_read(&buffermem_pages
) << (PAGE_SHIFT
-10));
2211 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2212 if (!spin_trylock(&lru_list_lock
))
2214 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
2215 found
= locked
= dirty
= used
= lastused
= protected = 0;
2216 bh
= lru_list
[nlist
];
2221 if (buffer_locked(bh
))
2223 if (buffer_protected(bh
))
2225 if (buffer_dirty(bh
))
2227 if (atomic_read(&bh
->b_count
))
2228 used
++, lastused
= found
;
2229 bh
= bh
->b_next_free
;
2230 } while (bh
!= lru_list
[nlist
]);
2232 int tmp
= nr_buffers_type
[nlist
];
2234 printk("%9s: BUG -> found %d, reported %d\n",
2235 buf_types
[nlist
], found
, tmp
);
2237 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2238 "%d locked, %d protected, %d dirty\n",
2239 buf_types
[nlist
], found
, size_buffers_type
[nlist
]>>10,
2240 used
, lastused
, locked
, protected, dirty
);
2242 spin_unlock(&lru_list_lock
);
2246 /* ===================== Init ======================= */
2249 * allocate the hash table and init the free list
2250 * Use gfp() for the hash table to decrease TLB misses, use
2251 * SLAB cache for buffer heads.
2253 void __init
buffer_init(unsigned long mempages
)
2256 unsigned int nr_hash
;
2258 /* The buffer cache hash table is less important these days,
2263 mempages
*= sizeof(struct buffer_head
*);
2265 for (order
= 0; (1 << order
) < mempages
; order
++)
2268 /* try to allocate something until we get it or we're asking
2269 for something that is really too small */
2274 nr_hash
= (PAGE_SIZE
<< order
) / sizeof(struct buffer_head
*);
2275 bh_hash_mask
= (nr_hash
- 1);
2279 while((tmp
>>= 1UL) != 0UL)
2282 hash_table
= (struct buffer_head
**)
2283 __get_free_pages(GFP_ATOMIC
, order
);
2284 } while (hash_table
== NULL
&& --order
> 0);
2285 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2286 nr_hash
, order
, (PAGE_SIZE
<< order
));
2289 panic("Failed to allocate buffer hash table\n");
2291 /* Setup hash chains. */
2292 for(i
= 0; i
< nr_hash
; i
++)
2293 hash_table
[i
] = NULL
;
2295 /* Setup free lists. */
2296 for(i
= 0; i
< NR_SIZES
; i
++) {
2297 free_list
[i
].list
= NULL
;
2298 free_list
[i
].lock
= SPIN_LOCK_UNLOCKED
;
2301 /* Setup lru lists. */
2302 for(i
= 0; i
< NR_LIST
; i
++)
2305 bh_cachep
= kmem_cache_create("buffer_head",
2306 sizeof(struct buffer_head
),
2308 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
2310 panic("Cannot create buffer head SLAB cache\n");
2314 /* ====================== bdflush support =================== */
2316 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2317 * response to dirty buffers. Once this process is activated, we write back
2318 * a limited number of buffers to the disks and then go back to sleep again.
2320 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done
);
2321 struct task_struct
*bdflush_tsk
= 0;
2323 void wakeup_bdflush(int block
)
2325 DECLARE_WAITQUEUE(wait
, current
);
2327 if (current
== bdflush_tsk
)
2331 wake_up_process(bdflush_tsk
);
2335 /* kflushd can wakeup us before we have a chance to
2336 go to sleep so we must be smart in handling
2337 this wakeup event from kflushd to avoid deadlocking in SMP
2338 (we are not holding any lock anymore in these two paths). */
2339 __set_current_state(TASK_UNINTERRUPTIBLE
);
2340 add_wait_queue(&bdflush_done
, &wait
);
2342 wake_up_process(bdflush_tsk
);
2345 remove_wait_queue(&bdflush_done
, &wait
);
2346 __set_current_state(TASK_RUNNING
);
2349 /* This is the _only_ function that deals with flushing async writes
2351 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2352 as all dirty buffers lives _only_ in the DIRTY lru list.
2353 As we never browse the LOCKED and CLEAN lru lists they are infact
2354 completly useless. */
2355 static int flush_dirty_buffers(int check_flushtime
)
2357 struct buffer_head
* bh
, *next
;
2361 spin_lock(&lru_list_lock
);
2362 bh
= lru_list
[BUF_DIRTY
];
2365 for (i
= nr_buffers_type
[BUF_DIRTY
]; i
-- > 0; bh
= next
) {
2366 next
= bh
->b_next_free
;
2368 if (!buffer_dirty(bh
)) {
2369 __refile_buffer(bh
);
2372 if (buffer_locked(bh
))
2375 if (check_flushtime
) {
2376 /* The dirty lru list is chronologically ordered so
2377 if the current bh is not yet timed out,
2378 then also all the following bhs
2379 will be too young. */
2380 if (time_before(jiffies
, bh
->b_flushtime
))
2383 if (++flushed
> bdf_prm
.b_un
.ndirty
)
2387 /* OK, now we are committed to write it out. */
2388 atomic_inc(&bh
->b_count
);
2389 spin_unlock(&lru_list_lock
);
2390 ll_rw_block(WRITE
, 1, &bh
);
2391 atomic_dec(&bh
->b_count
);
2393 if (current
->need_resched
)
2398 spin_unlock(&lru_list_lock
);
2404 * Here we attempt to write back old buffers. We also try to flush inodes
2405 * and supers as well, since this function is essentially "update", and
2406 * otherwise there would be no way of ensuring that these quantities ever
2407 * get written back. Ideally, we would have a timestamp on the inodes
2408 * and superblocks so that we could write back only the old ones as well
2411 static int sync_old_buffers(void)
2418 flush_dirty_buffers(1);
2419 /* must really sync all the active I/O request to disk here */
2420 run_task_queue(&tq_disk
);
2424 int block_sync_page(struct page
*page
)
2426 run_task_queue(&tq_disk
);
2430 /* This is the interface to bdflush. As we get more sophisticated, we can
2431 * pass tuning parameters to this "process", to adjust how it behaves.
2432 * We would want to verify each parameter, however, to make sure that it
2435 asmlinkage
long sys_bdflush(int func
, long data
)
2437 if (!capable(CAP_SYS_ADMIN
))
2441 /* do_exit directly and let kupdate to do its work alone. */
2443 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2444 a syscall that doesn't care about the current mm context. */
2446 struct mm_struct
*user_mm
;
2449 * bdflush will spend all of it's time in kernel-space,
2450 * without touching user-space, so we can switch it into
2451 * 'lazy TLB mode' to reduce the cost of context-switches
2452 * to and from bdflush.
2454 user_mm
= start_lazy_tlb();
2455 error
= sync_old_buffers();
2456 end_lazy_tlb(user_mm
);
2461 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2463 int i
= (func
-2) >> 1;
2464 if (i
>= 0 && i
< N_PARAM
) {
2465 if ((func
& 1) == 0)
2466 return put_user(bdf_prm
.data
[i
], (int*)data
);
2468 if (data
>= bdflush_min
[i
] && data
<= bdflush_max
[i
]) {
2469 bdf_prm
.data
[i
] = data
;
2476 /* Having func 0 used to launch the actual bdflush and then never
2477 * return (unless explicitly killed). We return zero here to
2478 * remain semi-compatible with present update(8) programs.
2484 * This is the actual bdflush daemon itself. It used to be started from
2485 * the syscall above, but now we launch it ourselves internally with
2486 * kernel_thread(...) directly after the first thread in init/main.c
2488 int bdflush(void *sem
)
2490 struct task_struct
*tsk
= current
;
2493 * We have a bare-bones task_struct, and really should fill
2494 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2495 * display semi-sane things. Not real crucial though...
2500 strcpy(tsk
->comm
, "kflushd");
2503 /* avoid getting signals */
2504 spin_lock_irq(&tsk
->sigmask_lock
);
2506 sigfillset(&tsk
->blocked
);
2507 recalc_sigpending(tsk
);
2508 spin_unlock_irq(&tsk
->sigmask_lock
);
2510 up((struct semaphore
*)sem
);
2513 CHECK_EMERGENCY_SYNC
2515 flushed
= flush_dirty_buffers(0);
2517 /* If wakeup_bdflush will wakeup us
2518 after our bdflush_done wakeup, then
2519 we must make sure to not sleep
2520 in schedule_timeout otherwise
2521 wakeup_bdflush may wait for our
2522 bdflush_done wakeup that would never arrive
2523 (as we would be sleeping) and so it would
2525 __set_current_state(TASK_INTERRUPTIBLE
);
2526 wake_up(&bdflush_done
);
2528 * If there are still a lot of dirty buffers around,
2529 * skip the sleep and flush some more. Otherwise, we
2530 * go to sleep waiting a wakeup.
2532 if (!flushed
|| balance_dirty_state(NODEV
) < 0)
2534 /* Remember to mark us as running otherwise
2535 the next schedule will block. */
2536 __set_current_state(TASK_RUNNING
);
2541 * This is the kernel update daemon. It was used to live in userspace
2542 * but since it's need to run safely we want it unkillable by mistake.
2543 * You don't need to change your userspace configuration since
2544 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2546 int kupdate(void *sem
)
2548 struct task_struct
* tsk
= current
;
2553 strcpy(tsk
->comm
, "kupdate");
2555 /* sigstop and sigcont will stop and wakeup kupdate */
2556 spin_lock_irq(&tsk
->sigmask_lock
);
2557 sigfillset(&tsk
->blocked
);
2558 siginitsetinv(¤t
->blocked
, sigmask(SIGCONT
) | sigmask(SIGSTOP
));
2559 recalc_sigpending(tsk
);
2560 spin_unlock_irq(&tsk
->sigmask_lock
);
2562 up((struct semaphore
*)sem
);
2565 /* update interval */
2566 interval
= bdf_prm
.b_un
.interval
;
2568 tsk
->state
= TASK_INTERRUPTIBLE
;
2569 schedule_timeout(interval
);
2572 tsk
->state
= TASK_STOPPED
;
2573 schedule(); /* wait for SIGCONT */
2575 /* check for sigstop */
2576 if (signal_pending(tsk
)) {
2578 spin_lock_irq(&tsk
->sigmask_lock
);
2579 if (sigismember(&tsk
->signal
, SIGSTOP
)) {
2580 sigdelset(&tsk
->signal
, SIGSTOP
);
2583 recalc_sigpending(tsk
);
2584 spin_unlock_irq(&tsk
->sigmask_lock
);
2589 printk("kupdate() activated...\n");
2595 static int __init
bdflush_init(void)
2597 DECLARE_MUTEX_LOCKED(sem
);
2598 kernel_thread(bdflush
, &sem
, CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
);
2600 kernel_thread(kupdate
, &sem
, CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
);
2605 module_init(bdflush_init
)