4 * Copyright (C) 1991, 1992 Linus Torvalds
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
54 static char buffersize_index[65] =
55 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59   6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
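/*
 * Note (added by the editor, not in the original source): BUFSIZE_INDEX()
 * maps a legal buffer size to its slot in buffersize_index[] above, e.g.
 * BUFSIZE_INDEX(512) == 0, BUFSIZE_INDEX(1024) == 1, BUFSIZE_INDEX(4096) == 3
 * and BUFSIZE_INDEX(32768) == 6; sizes that are not supported map to -1.
 */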
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
67 /* Anti-deadlock ordering:
68  *	lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
69  */
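/*
 * Illustrative sketch (added, not part of the original code): when two of
 * these locks are needed at once they must be nested in the order above,
 * as insert_into_queues() and try_to_free_buffers() below do:
 */
#if 0
	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	/* ... touch the hash chains and the lru lists ... */
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
#endif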
74 static unsigned int bh_hash_mask;
75 static unsigned int bh_hash_shift;
76 static struct buffer_head **hash_table;
77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;

79 static struct buffer_head *lru_list[NR_LIST];
80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
81 static int nr_buffers_type[NR_LIST];
82 static unsigned long size_buffers_type[NR_LIST];

84 static struct buffer_head * unused_list;
85 static int nr_unused_buffer_heads;
86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
89 struct bh_free_head {
90 	struct buffer_head *list;
91 	spinlock_t lock;
92 };
93 static struct bh_free_head free_list[NR_SIZES];

95 kmem_cache_t *bh_cachep;

97 static int grow_buffers(int size);
98 static void __refile_buffer(struct buffer_head *);

100 /* This is used by some architectures to estimate available memory. */
101 atomic_t buffermem_pages = ATOMIC_INIT(0);
103 /* Here is the parameter block for the bdflush process. If you add or
104 * remove any of the parameters, make sure to update kernel/sysctl.c.
109 /* The dummy values in this structure are left in there for compatibility
110 * with old programs that play with the /proc entries.
112 union bdflush_param {
113 	struct {
114 		int nfract;	/* Percentage of buffer cache dirty to
115 				   activate bdflush */
116 		int ndirty;	/* Maximum number of dirty blocks to write out per
117 				   wake-cycle */
118 		int nrefill;	/* Number of clean buffers to try to obtain
119 				   each time we call refill */
120 		int nref_dirt;	/* Dirty buffer threshold for activating bdflush
121 				   when trying to refill buffers. */
122 		int interval;	/* jiffies delay between kupdate flushes */
123 		int age_buffer;	/* Time for normal buffer to age before we flush it */
124 		int age_super;	/* Time for superblock to age before we flush it */
125 		int dummy2;	/* unused */
126 		int dummy3;	/* unused */
127 	} b_un;
128 	unsigned int data[N_PARAM];
129 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
131 /* These are the min and max parameter values that we will allow to be assigned */
132 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
133 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
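/*
 * Illustrative sketch (added, not part of the original code): these
 * parameters are exposed through the bdflush(2) interface implemented by
 * sys_bdflush() below; parameter i is read with func 2*i+2 and written
 * with func 2*i+3, clamped to bdflush_min[i]..bdflush_max[i].  Assuming a
 * userspace bdflush() syscall wrapper, tuning nfract (data[0]) would look
 * roughly like this:
 */
#if 0
	long nfract;
	bdflush(2, (long)&nfract);	/* read bdf_prm.data[0] */
	bdflush(3, 60);			/* set bdf_prm.data[0] to 60% */
#endif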
136 * Rewrote the wait-routines to use the "new" wait-queue functionality,
137 * and getting rid of the cli-sti pairs. The wait-queue routines still
138 * need cli-sti, but now it's just a couple of 386 instructions or so.
140  * Note that the real wait_on_buffer() is an inline function that checks
141  * if 'b_wait' is set before calling this, so that the queues aren't set
142  * up unnecessarily.
144 void __wait_on_buffer(struct buffer_head * bh)
145 {
146 	struct task_struct *tsk = current;
147 	DECLARE_WAITQUEUE(wait, tsk);

149 	atomic_inc(&bh->b_count);
150 	add_wait_queue(&bh->b_wait, &wait);
151 	do {
152 		run_task_queue(&tq_disk);
153 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
154 		if (!buffer_locked(bh))
155 			break;
156 		schedule();
157 	} while (buffer_locked(bh));
158 	tsk->state = TASK_RUNNING;
159 	remove_wait_queue(&bh->b_wait, &wait);
160 	atomic_dec(&bh->b_count);
161 }
163 /* Call sync_buffers with wait!=0 to ensure that the call does not
164 * return until all buffer writes have completed. Sync() may return
165 * before the writes have finished; fsync() may not.
168 /* Godamity-damn. Some buffers (bitmaps for filesystems)
169 * spontaneously dirty themselves without ever brelse being called.
170 * We will ultimately want to put these in a separate list, but for
171 * now we search all of the lists for dirty buffers.
173 static int sync_buffers(kdev_t dev
, int wait
)
175 int i
, retry
, pass
= 0, err
= 0;
176 struct buffer_head
* bh
, *next
;
178 /* One pass for no-wait, three for wait:
179 * 0) write out all dirty, unlocked buffers;
180 * 1) write out all dirty buffers, waiting if locked;
181 * 2) wait for completion by waiting for all buffers to unlock.
186 /* We search all lists as a failsafe mechanism, not because we expect
187 * there to be dirty buffers on any of the other lists.
190 spin_lock(&lru_list_lock
);
191 bh
= lru_list
[BUF_DIRTY
];
195 for (i
= nr_buffers_type
[BUF_DIRTY
]*2 ; i
-- > 0 ; bh
= next
) {
196 next
= bh
->b_next_free
;
198 if (!lru_list
[BUF_DIRTY
])
200 if (dev
&& bh
->b_dev
!= dev
)
202 if (buffer_locked(bh
)) {
203 /* Buffer is locked; skip it unless wait is
204 * requested AND pass > 0.
206 if (!wait
|| !pass
) {
210 atomic_inc(&bh
->b_count
);
211 spin_unlock(&lru_list_lock
);
213 atomic_dec(&bh
->b_count
);
217 /* If an unlocked buffer is not uptodate, there has
218 * been an IO error. Skip it.
220 if (wait
&& buffer_req(bh
) && !buffer_locked(bh
) &&
221 !buffer_dirty(bh
) && !buffer_uptodate(bh
)) {
226 /* Don't write clean buffers. Don't write ANY buffers
229 if (!buffer_dirty(bh
) || pass
>= 2)
232 atomic_inc(&bh
->b_count
);
233 spin_unlock(&lru_list_lock
);
234 ll_rw_block(WRITE
, 1, &bh
);
235 atomic_dec(&bh
->b_count
);
241 bh
= lru_list
[BUF_LOCKED
];
243 spin_unlock(&lru_list_lock
);
246 for (i
= nr_buffers_type
[BUF_LOCKED
]*2 ; i
-- > 0 ; bh
= next
) {
247 next
= bh
->b_next_free
;
249 if (!lru_list
[BUF_LOCKED
])
251 if (dev
&& bh
->b_dev
!= dev
)
253 if (buffer_locked(bh
)) {
254 /* Buffer is locked; skip it unless wait is
255 * requested AND pass > 0.
257 if (!wait
|| !pass
) {
261 atomic_inc(&bh
->b_count
);
262 spin_unlock(&lru_list_lock
);
264 spin_lock(&lru_list_lock
);
265 atomic_dec(&bh
->b_count
);
269 spin_unlock(&lru_list_lock
);
271 /* If we are waiting for the sync to succeed, and if any dirty
272 * blocks were written, then repeat; on the second pass, only
273 * wait for buffers being written (do not pass to write any
274 * more buffers on the second pass).
276 } while (wait
&& retry
&& ++pass
<=2);
280 void sync_dev(kdev_t dev
)
285 	/* sync all the dirty buffers out to disk only _after_ all the
286 	   high level layers have finished generating dirty buffer data
287 	   (or we'll return with some buffers still dirty on the block device,
288 	   breaking the semantics of this call) */
289 sync_buffers(dev
, 0);
291 * FIXME(eric) we need to sync the physical devices here.
292 * This is because some (scsi) controllers have huge amounts of
293 * cache onboard (hundreds of Mb), and we need to instruct
294 * them to commit all of the dirty memory to disk, and we should
295 * not return until this has happened.
297 * This would need to get implemented by going through the assorted
298 * layers so that each block major number can be synced, and this
299 * would call down into the upper and mid-layer scsi.
303 int fsync_dev(kdev_t dev
)
305 sync_buffers(dev
, 0);
313 return sync_buffers(dev
, 1);
316 asmlinkage
long sys_sync(void)
323 * filp may be NULL if called via the msync of a vma.
326 int file_fsync(struct file
*filp
, struct dentry
*dentry
)
328 struct inode
* inode
= dentry
->d_inode
;
329 struct super_block
* sb
;
334 /* sync the inode to buffers */
335 write_inode_now(inode
);
337 /* sync the superblock to buffers */
340 if (sb
->s_op
&& sb
->s_op
->write_super
)
341 sb
->s_op
->write_super(sb
);
343 /* .. finally sync the buffers to disk */
345 ret
= sync_buffers(dev
, 1);
350 asmlinkage
long sys_fsync(unsigned int fd
)
353 struct dentry
* dentry
;
354 struct inode
* inode
;
362 dentry
= file
->f_dentry
;
366 inode
= dentry
->d_inode
;
371 if (!file
->f_op
|| !file
->f_op
->fsync
)
374 /* We need to protect against concurrent writers.. */
376 err
= file
->f_op
->fsync(file
, dentry
);
385 asmlinkage
long sys_fdatasync(unsigned int fd
)
388 struct dentry
* dentry
;
389 struct inode
* inode
;
397 dentry
= file
->f_dentry
;
401 inode
= dentry
->d_inode
;
406 if (!file
->f_op
|| !file
->f_op
->fsync
)
409 /* this needs further work, at the moment it is identical to fsync() */
411 err
= file
->f_op
->fsync(file
, dentry
);
420 /* After several hours of tedious analysis, the following hash
421 * function won. Do not mess with it... -DaveM
423 #define _hashfn(dev,block) \
424 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
425 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
426 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
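/*
 * Note (added by the editor, not in the original source): _hashfn() mixes
 * the device and block numbers with shifts derived from bh_hash_shift
 * (log2 of the number of hash buckets, computed in buffer_init() below),
 * and hash() masks the result into the table with bh_hash_mask.
 */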
428 static __inline__
void __hash_link(struct buffer_head
*bh
, struct buffer_head
**head
)
430 if ((bh
->b_next
= *head
) != NULL
)
431 bh
->b_next
->b_pprev
= &bh
->b_next
;
436 static __inline__
void __hash_unlink(struct buffer_head
*bh
)
440 bh
->b_next
->b_pprev
= bh
->b_pprev
;
441 *(bh
->b_pprev
) = bh
->b_next
;
446 static void __insert_into_lru_list(struct buffer_head
* bh
, int blist
)
448 struct buffer_head
**bhp
= &lru_list
[blist
];
452 bh
->b_prev_free
= bh
;
454 bh
->b_next_free
= *bhp
;
455 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
456 (*bhp
)->b_prev_free
->b_next_free
= bh
;
457 (*bhp
)->b_prev_free
= bh
;
458 nr_buffers_type
[blist
]++;
459 size_buffers_type
[blist
] += bh
->b_size
;
462 static void __remove_from_lru_list(struct buffer_head
* bh
, int blist
)
464 if (bh
->b_prev_free
|| bh
->b_next_free
) {
465 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
466 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
467 if (lru_list
[blist
] == bh
)
468 lru_list
[blist
] = bh
->b_next_free
;
469 if (lru_list
[blist
] == bh
)
470 lru_list
[blist
] = NULL
;
471 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
472 nr_buffers_type
[blist
]--;
473 size_buffers_type
[blist
] -= bh
->b_size
;
477 static void __remove_from_free_list(struct buffer_head
* bh
, int index
)
479 if(bh
->b_next_free
== bh
)
480 free_list
[index
].list
= NULL
;
482 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
483 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
484 if (free_list
[index
].list
== bh
)
485 free_list
[index
].list
= bh
->b_next_free
;
487 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
490 /* must be called with both the hash_table_lock and the lru_list_lock
492 static void __remove_from_queues(struct buffer_head
*bh
)
495 __remove_from_lru_list(bh
, bh
->b_list
);
498 static void insert_into_queues(struct buffer_head
*bh
)
500 struct buffer_head
**head
= &hash(bh
->b_dev
, bh
->b_blocknr
);
502 spin_lock(&lru_list_lock
);
503 write_lock(&hash_table_lock
);
504 __hash_link(bh
, head
);
505 __insert_into_lru_list(bh
, bh
->b_list
);
506 write_unlock(&hash_table_lock
);
507 spin_unlock(&lru_list_lock
);
510 /* This function must only run if there are no other
511 * references _anywhere_ to this buffer head.
513 static void put_last_free(struct buffer_head
* bh
)
515 struct bh_free_head
*head
= &free_list
[BUFSIZE_INDEX(bh
->b_size
)];
516 struct buffer_head
**bhp
= &head
->list
;
520 spin_lock(&head
->lock
);
524 bh
->b_prev_free
= bh
;
526 bh
->b_next_free
= *bhp
;
527 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
528 (*bhp
)->b_prev_free
->b_next_free
= bh
;
529 (*bhp
)->b_prev_free
= bh
;
530 spin_unlock(&head
->lock
);
534 * Why like this, I hear you say... The reason is race-conditions.
535 * As we don't lock buffers (unless we are reading them, that is),
536 * something might happen to it while we sleep (ie a read-error
537 * will force it bad). This shouldn't really happen currently, but
540 struct buffer_head
* get_hash_table(kdev_t dev
, int block
, int size
)
542 struct buffer_head
**head
= &hash(dev
, block
);
543 struct buffer_head
*bh
;
545 read_lock(&hash_table_lock
);
546 for(bh
= *head
; bh
; bh
= bh
->b_next
)
547 if (bh
->b_blocknr
== block
&&
548 bh
->b_size
== size
&&
552 atomic_inc(&bh
->b_count
);
553 read_unlock(&hash_table_lock
);
558 unsigned int get_hardblocksize(kdev_t dev
)
561 * Get the hard sector size for the given device. If we don't know
562 * what it is, return 0.
564 if (hardsect_size
[MAJOR(dev
)] != NULL
) {
565 int blksize
= hardsect_size
[MAJOR(dev
)][MINOR(dev
)];
571 * We don't know what the hardware sector size for this device is.
572 * Return 0 indicating that we don't know.
577 /* If invalidate_buffers() will trash dirty buffers, it means some kind
578    of fs corruption is going on. Trashing dirty data always implies losing
579    information that was supposed to be just stored on the physical layer
580    by the user.

582    Thus invalidate_buffers in general usage is not allowed to trash dirty
583    buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.

585    NOTE: in the case where the user removed a removable-media disk while
586    there was still dirty data not synced to disk (due to a bug in the device
587    driver or to an error of the user), by not destroying the dirty buffers
588    we could generate corruption also on the next media inserted. Thus a
589    parameter is necessary to handle this case in the safest way possible
590    (trying not to corrupt the newly inserted disk with data belonging to
591    the old, now corrupted, disk). Also for the ramdisk the natural thing
592    to do in order to release the ramdisk memory is to destroy dirty buffers.

594    These are two special cases. Normal usage implies that the device driver
595    issues a sync on the device (without waiting for I/O completion) and
596    then an invalidate_buffers call that doesn't trash dirty buffers. */
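/*
 * Illustrative sketch (added, not part of the original code): the normal
 * pattern described above, as a block driver would use it on media change,
 * assuming 'dev' is the kdev_t in question and the invalidate_buffers()
 * wrapper from linux/fs.h:
 */
#if 0
	sync_buffers(dev, 0);		/* start writeback, don't wait for it */
	invalidate_buffers(dev);	/* drop clean buffers, preserve dirty ones */
#endif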
597 void __invalidate_buffers(kdev_t dev
, int destroy_dirty_buffers
)
600 struct buffer_head
* bh
, * bh_next
;
604 spin_lock(&lru_list_lock
);
605 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
606 bh
= lru_list
[nlist
];
609 for (i
= nr_buffers_type
[nlist
]; i
> 0 ; bh
= bh_next
, i
--) {
610 bh_next
= bh
->b_next_free
;
611 if (bh
->b_dev
!= dev
)
613 if (buffer_locked(bh
)) {
614 atomic_inc(&bh
->b_count
);
615 spin_unlock(&lru_list_lock
);
618 spin_lock(&lru_list_lock
);
619 atomic_dec(&bh
->b_count
);
622 write_lock(&hash_table_lock
);
623 if (!atomic_read(&bh
->b_count
) &&
624 (destroy_dirty_buffers
|| !buffer_dirty(bh
))) {
625 __remove_from_queues(bh
);
628 write_unlock(&hash_table_lock
);
634 spin_unlock(&lru_list_lock
);
639 void set_blocksize(kdev_t dev
, int size
)
641 extern int *blksize_size
[];
643 struct buffer_head
* bh
, * bh_next
;
645 if (!blksize_size
[MAJOR(dev
)])
648 /* Size must be a power of two, and between 512 and PAGE_SIZE */
649 if (size
> PAGE_SIZE
|| size
< 512 || (size
& (size
-1)))
650 panic("Invalid blocksize passed to set_blocksize");
652 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == 0 && size
== BLOCK_SIZE
) {
653 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
656 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == size
)
658 sync_buffers(dev
, 2);
659 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
663 spin_lock(&lru_list_lock
);
664 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
665 bh
= lru_list
[nlist
];
668 for (i
= nr_buffers_type
[nlist
]; i
> 0 ; bh
= bh_next
, i
--) {
669 bh_next
= bh
->b_next_free
;
670 if (bh
->b_dev
!= dev
|| bh
->b_size
== size
)
672 if (buffer_locked(bh
)) {
673 atomic_inc(&bh
->b_count
);
674 spin_unlock(&lru_list_lock
);
677 spin_lock(&lru_list_lock
);
678 atomic_dec(&bh
->b_count
);
681 write_lock(&hash_table_lock
);
682 if (!atomic_read(&bh
->b_count
)) {
683 if (buffer_dirty(bh
))
685 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
686 kdevname(dev
), bh
->b_blocknr
, bh
->b_size
);
687 __remove_from_queues(bh
);
690 if (atomic_set_buffer_clean(bh
))
692 clear_bit(BH_Uptodate
, &bh
->b_state
);
695 "b_count %d, dev %s, block %lu, from %p\n",
696 atomic_read(&bh
->b_count
), bdevname(bh
->b_dev
),
697 bh
->b_blocknr
, __builtin_return_address(0));
699 write_unlock(&hash_table_lock
);
705 spin_unlock(&lru_list_lock
);
711 * We used to try various strange things. Let's not.
713 static void refill_freelist(int size
)
715 if (!grow_buffers(size
)) {
717 current
->policy
|= SCHED_YIELD
;
722 void init_buffer(struct buffer_head
*bh
, bh_end_io_t
*handler
, void *dev_id
)
724 bh
->b_list
= BUF_CLEAN
;
725 bh
->b_end_io
= handler
;
726 bh
->b_dev_id
= dev_id
;
729 static void end_buffer_io_sync(struct buffer_head
*bh
, int uptodate
)
731 mark_buffer_uptodate(bh
, uptodate
);
735 static void end_buffer_io_bad(struct buffer_head
*bh
, int uptodate
)
737 mark_buffer_uptodate(bh
, uptodate
);
742 static void end_buffer_io_async(struct buffer_head
* bh
, int uptodate
)
744 static spinlock_t page_uptodate_lock
= SPIN_LOCK_UNLOCKED
;
746 struct buffer_head
*tmp
;
749 mark_buffer_uptodate(bh
, uptodate
);
751 /* This is a temporary buffer used for page I/O. */
758 * Be _very_ careful from here on. Bad things can happen if
759 * two buffer heads end IO at almost the same time and both
760 * decide that the page is now completely done.
762 * Async buffer_heads are here only as labels for IO, and get
763 * thrown away once the IO for this page is complete. IO is
764 * deemed complete once all buffers have been visited
765 * (b_count==0) and are now unlocked. We must make sure that
766 * only the _last_ buffer that decrements its count is the one
767 	 * that unlocks the page..
769 spin_lock_irqsave(&page_uptodate_lock
, flags
);
771 atomic_dec(&bh
->b_count
);
772 tmp
= bh
->b_this_page
;
774 if (tmp
->b_end_io
== end_buffer_io_async
&& buffer_locked(tmp
))
776 tmp
= tmp
->b_this_page
;
779 /* OK, the async IO on this page is complete. */
780 spin_unlock_irqrestore(&page_uptodate_lock
, flags
);
783 * if none of the buffers had errors then we can set the
786 if (!PageError(page
))
787 SetPageUptodate(page
);
790 * Run the hooks that have to be done when a page I/O has completed.
792 if (PageTestandClearDecrAfter(page
))
793 atomic_dec(&nr_async_pages
);
800 spin_unlock_irqrestore(&page_uptodate_lock
, flags
);
805 * Ok, this is getblk, and it isn't very clear, again to hinder
806 * race-conditions. Most of the code is seldom used, (ie repeating),
807 * so it should be much more efficient than it looks.
809 * The algorithm is changed: hopefully better, and an elusive bug removed.
811 * 14.02.92: changed it to sync dirty buffers a bit: better performance
812 * when the filesystem starts to get full of dirty blocks (I hope).
814 struct buffer_head
* getblk(kdev_t dev
, int block
, int size
)
816 struct buffer_head
* bh
;
820 bh
= get_hash_table(dev
, block
, size
);
824 isize
= BUFSIZE_INDEX(size
);
825 spin_lock(&free_list
[isize
].lock
);
826 bh
= free_list
[isize
].list
;
828 __remove_from_free_list(bh
, isize
);
829 atomic_set(&bh
->b_count
, 1);
831 spin_unlock(&free_list
[isize
].lock
);
834 * OK, FINALLY we know that this buffer is the only one of
835 * its kind, we hold a reference (b_count>0), it is unlocked,
839 init_buffer(bh
, end_buffer_io_sync
, NULL
);
841 bh
->b_blocknr
= block
;
842 bh
->b_state
= 1 << BH_Mapped
;
844 /* Insert the buffer into the regular lists */
845 insert_into_queues(bh
);
852 * If we block while refilling the free list, somebody may
853 * create the buffer first ... search the hashes again.
855 refill_freelist(size
);
859 /* -1 -> no need to flush
860     0 -> async flush
861     1 -> sync flush (wait for I/O completion) */
862 static int balance_dirty_state(kdev_t dev
)
864 unsigned long dirty
, tot
, hard_dirty_limit
, soft_dirty_limit
;
866 dirty
= size_buffers_type
[BUF_DIRTY
] >> PAGE_SHIFT
;
867 tot
= nr_free_buffer_pages();
868 tot
-= size_buffers_type
[BUF_PROTECTED
] >> PAGE_SHIFT
;
871 soft_dirty_limit
= tot
* bdf_prm
.b_un
.nfract
;
872 hard_dirty_limit
= soft_dirty_limit
* 2;
874 if (dirty
> soft_dirty_limit
) {
875 if (dirty
> hard_dirty_limit
)
883 * if a new dirty buffer is created we need to balance bdflush.
885 * in the future we might want to make bdflush aware of different
886  * pressures on different devices - thus the (currently unused) 'dev' parameter.
889 void balance_dirty(kdev_t dev
)
891 int state
= balance_dirty_state(dev
);
895 wakeup_bdflush(state
);
898 static __inline__
void __mark_dirty(struct buffer_head
*bh
, int flag
)
900 bh
->b_flushtime
= jiffies
+ (flag
? bdf_prm
.b_un
.age_super
: bdf_prm
.b_un
.age_buffer
);
904 /* atomic version, the user must call balance_dirty() by hand
905    as soon as it becomes possible to block */
906 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
907 {
908 	if (!atomic_set_buffer_dirty(bh))
909 		__mark_dirty(bh, flag);
910 }

912 void mark_buffer_dirty(struct buffer_head *bh, int flag)
913 {
914 	__mark_buffer_dirty(bh, flag);
915 	balance_dirty(bh->b_dev);
916 }
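/*
 * Illustrative sketch (added, not part of the original code): a caller
 * that dirties a buffer while it cannot block uses the atomic variant and
 * defers the balancing, as the comment above describes.  'some_lock' is a
 * hypothetical lock standing in for whatever the caller holds:
 */
#if 0
	spin_lock(&some_lock);
	__mark_buffer_dirty(bh, 0);	/* may not block here */
	spin_unlock(&some_lock);
	balance_dirty(bh->b_dev);	/* now blocking is allowed */
#endif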
919 * A buffer may need to be moved from one buffer list to another
920 * (e.g. in case it is not shared any more). Handle this.
922 static void __refile_buffer(struct buffer_head
*bh
)
924 int dispose
= BUF_CLEAN
;
925 if (buffer_locked(bh
))
926 dispose
= BUF_LOCKED
;
927 if (buffer_dirty(bh
))
929 if (buffer_protected(bh
))
930 dispose
= BUF_PROTECTED
;
931 if (dispose
!= bh
->b_list
) {
932 __remove_from_lru_list(bh
, bh
->b_list
);
933 bh
->b_list
= dispose
;
934 __insert_into_lru_list(bh
, dispose
);
938 void refile_buffer(struct buffer_head
*bh
)
940 spin_lock(&lru_list_lock
);
942 spin_unlock(&lru_list_lock
);
946 * Release a buffer head
948 void __brelse(struct buffer_head
* buf
)
950 if (atomic_read(&buf
->b_count
)) {
951 atomic_dec(&buf
->b_count
);
954 printk("VFS: brelse: Trying to free free buffer\n");
958 * bforget() is like brelse(), except it puts the buffer on the
959 * free list if it can.. We can NOT free the buffer if:
960 * - there are other users of it
961 * - it is locked and thus can have active IO
963 void __bforget(struct buffer_head
* buf
)
965 /* grab the lru lock here to block bdflush. */
966 spin_lock(&lru_list_lock
);
967 write_lock(&hash_table_lock
);
968 if (!atomic_dec_and_test(&buf
->b_count
) || buffer_locked(buf
))
971 write_unlock(&hash_table_lock
);
972 __remove_from_lru_list(buf
, buf
->b_list
);
973 spin_unlock(&lru_list_lock
);
978 write_unlock(&hash_table_lock
);
979 spin_unlock(&lru_list_lock
);
983 * bread() reads a specified block and returns the buffer that contains
984 * it. It returns NULL if the block was unreadable.
986 struct buffer_head
* bread(kdev_t dev
, int block
, int size
)
988 struct buffer_head
* bh
;
990 bh
= getblk(dev
, block
, size
);
991 if (buffer_uptodate(bh
))
993 ll_rw_block(READ
, 1, &bh
);
995 if (buffer_uptodate(bh
))
1002 * Ok, breada can be used as bread, but additionally to mark other
1003  * blocks for reading as well. End the argument list with a negative
1004  * number.
1009 struct buffer_head
* breada(kdev_t dev
, int block
, int bufsize
,
1010 unsigned int pos
, unsigned int filesize
)
1012 struct buffer_head
* bhlist
[NBUF
];
1013 unsigned int blocks
;
1014 struct buffer_head
* bh
;
1018 if (pos
>= filesize
)
1024 bh
= getblk(dev
, block
, bufsize
);
1025 index
= BUFSIZE_INDEX(bh
->b_size
);
1027 if (buffer_uptodate(bh
))
1029 else ll_rw_block(READ
, 1, &bh
);
1031 blocks
= (filesize
- pos
) >> (9+index
);
1033 if (blocks
< (read_ahead
[MAJOR(dev
)] >> index
))
1034 blocks
= read_ahead
[MAJOR(dev
)] >> index
;
1038 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1042 for(i
=1; i
<blocks
; i
++) {
1043 bh
= getblk(dev
,block
+i
,bufsize
);
1044 if (buffer_uptodate(bh
)) {
1048 else bhlist
[j
++] = bh
;
1051 /* Request the read for these buffers, and then release them. */
1053 ll_rw_block(READA
, (j
-1), bhlist
+1);
1057 /* Wait for this buffer, and then continue on. */
1060 if (buffer_uptodate(bh
))
1067 * Note: the caller should wake up the buffer_wait list if needed.
1069 static __inline__
void __put_unused_buffer_head(struct buffer_head
* bh
)
1071 if (nr_unused_buffer_heads
>= MAX_UNUSED_BUFFERS
) {
1072 kmem_cache_free(bh_cachep
, bh
);
1075 init_waitqueue_head(&bh
->b_wait
);
1076 nr_unused_buffer_heads
++;
1077 bh
->b_next_free
= unused_list
;
1078 bh
->b_this_page
= NULL
;
1084 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1085 * no-buffer-head deadlock. Return NULL on failure; waiting for
1086 * buffer heads is now handled in create_buffers().
1088 static struct buffer_head
* get_unused_buffer_head(int async
)
1090 struct buffer_head
* bh
;
1092 spin_lock(&unused_list_lock
);
1093 if (nr_unused_buffer_heads
> NR_RESERVED
) {
1095 unused_list
= bh
->b_next_free
;
1096 nr_unused_buffer_heads
--;
1097 spin_unlock(&unused_list_lock
);
1100 spin_unlock(&unused_list_lock
);
1102 /* This is critical. We can't swap out pages to get
1103 * more buffer heads, because the swap-out may need
1104 * more buffer-heads itself. Thus SLAB_BUFFER.
1106 if((bh
= kmem_cache_alloc(bh_cachep
, SLAB_BUFFER
)) != NULL
) {
1107 memset(bh
, 0, sizeof(*bh
));
1108 init_waitqueue_head(&bh
->b_wait
);
1113 * If we need an async buffer, use the reserved buffer heads.
1116 spin_lock(&unused_list_lock
);
1119 unused_list
= bh
->b_next_free
;
1120 nr_unused_buffer_heads
--;
1121 spin_unlock(&unused_list_lock
);
1124 spin_unlock(&unused_list_lock
);
1128 * (Pending further analysis ...)
1129 * Ordinary (non-async) requests can use a different memory priority
1130 	 * to free up pages. Any swapping thus generated will use async
1131 	 * buffer heads.
1134 (bh
= kmem_cache_alloc(bh_cachep
, SLAB_KERNEL
)) != NULL
) {
1135 memset(bh
, 0, sizeof(*bh
));
1136 init_waitqueue_head(&bh
->b_wait
);
1144 void set_bh_page (struct buffer_head
*bh
, struct page
*page
, unsigned long offset
)
1147 if (offset
>= PAGE_SIZE
)
1149 if (PageHighMem(page
))
1151 * This catches illegal uses and preserves the offset:
1153 bh
->b_data
= (char *)(0 + offset
);
1155 bh
->b_data
= (char *)(page_address(page
) + offset
);
1159 * Create the appropriate buffers when given a page for data area and
1160 * the size of each buffer.. Use the bh->b_this_page linked list to
1161  * follow the buffers created.  Return NULL if unable to create more
1162  * buffers.
1163 * The async flag is used to differentiate async IO (paging, swapping)
1164 * from ordinary buffer allocations, and only async requests are allowed
1165 * to sleep waiting for buffer heads.
1167 static struct buffer_head
* create_buffers(struct page
* page
, unsigned long size
, int async
)
1169 struct buffer_head
*bh
, *head
;
1175 while ((offset
-= size
) >= 0) {
1176 bh
= get_unused_buffer_head(async
);
1180 bh
->b_dev
= B_FREE
; /* Flag as unused */
1181 bh
->b_this_page
= head
;
1185 bh
->b_next_free
= NULL
;
1187 atomic_set(&bh
->b_count
, 0);
1190 set_bh_page(bh
, page
, offset
);
1192 bh
->b_list
= BUF_CLEAN
;
1193 bh
->b_end_io
= end_buffer_io_bad
;
1197 * In case anything failed, we just free everything we got.
1201 spin_lock(&unused_list_lock
);
1204 head
= head
->b_this_page
;
1205 __put_unused_buffer_head(bh
);
1207 spin_unlock(&unused_list_lock
);
1209 /* Wake up any waiters ... */
1210 wake_up(&buffer_wait
);
1214 * Return failure for non-async IO requests. Async IO requests
1215 * are not allowed to fail, so we have to wait until buffer heads
1216 * become available. But we don't want tasks sleeping with
1217 * partially complete buffers, so all were released above.
1222 /* We're _really_ low on memory. Now we just
1223 * wait for old buffer heads to become free due to
1224 * finishing IO. Since this is an async request and
1225 * the reserve list is empty, we're sure there are
1226 * async buffer heads in use.
1228 run_task_queue(&tq_disk
);
1231 * Set our state for sleeping, then check again for buffer heads.
1232 * This ensures we won't miss a wake_up from an interrupt.
1234 wait_event(buffer_wait
, nr_unused_buffer_heads
>= MAX_BUF_PER_PAGE
);
1238 static int create_page_buffers(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
)
1240 struct buffer_head
*head
, *bh
, *tail
;
1243 if (!PageLocked(page
))
1246 * Allocate async buffer heads pointing to this page, just for I/O.
1247 * They don't show up in the buffer hash table, but they *are*
1248 * registered in page->buffers.
1250 head
= create_buffers(page
, size
, 1);
1256 for (bh
= head
; bh
; bh
= bh
->b_this_page
) {
1260 init_buffer(bh
, end_buffer_io_async
, NULL
);
1262 bh
->b_blocknr
= block
;
1264 set_bit(BH_Mapped
, &bh
->b_state
);
1266 tail
->b_this_page
= head
;
1267 page_cache_get(page
);
1268 page
->buffers
= head
;
1272 static void unmap_buffer(struct buffer_head
* bh
)
1274 if (buffer_mapped(bh
)) {
1275 mark_buffer_clean(bh
);
1277 clear_bit(BH_Uptodate
, &bh
->b_state
);
1278 clear_bit(BH_Mapped
, &bh
->b_state
);
1279 clear_bit(BH_Req
, &bh
->b_state
);
1280 clear_bit(BH_New
, &bh
->b_state
);
1285 * We don't have to release all buffers here, but
1286 * we have to be sure that no dirty buffer is left
1287 * and no IO is going on (no buffer is locked), because
1288  * we have truncated the file and are going to free the
1289  * blocks on-disk..
1290  */
1291 int block_flushpage(struct page
*page
, unsigned long offset
)
1293 struct buffer_head
*head
, *bh
, *next
;
1294 unsigned int curr_off
= 0;
1296 if (!PageLocked(page
))
1301 head
= page
->buffers
;
1304 unsigned int next_off
= curr_off
+ bh
->b_size
;
1305 next
= bh
->b_this_page
;
1308 * is this block fully flushed?
1310 if (offset
<= curr_off
)
1312 curr_off
= next_off
;
1314 } while (bh
!= head
);
1317 	 * subtle. We release buffer-heads only if this is
1318 	 * the 'final' flushpage. We have invalidated the get_block
1319 	 * cached value unconditionally, so real IO is not
1320 	 * possible anymore.
1321 	 *
1322 	 * If the free doesn't work out, the buffers can be
1323 	 * left around - they just turn into anonymous buffers
1324 	 * instead.
1325 	 */
1327 if (!try_to_free_buffers(page
)) {
1328 atomic_inc(&buffermem_pages
);
1336 static void create_empty_buffers(struct page
*page
, struct inode
*inode
, unsigned long blocksize
)
1338 struct buffer_head
*bh
, *head
, *tail
;
1340 head
= create_buffers(page
, blocksize
, 1);
1346 bh
->b_dev
= inode
->i_dev
;
1348 bh
->b_end_io
= end_buffer_io_bad
;
1350 bh
= bh
->b_this_page
;
1352 tail
->b_this_page
= head
;
1353 page
->buffers
= head
;
1354 page_cache_get(page
);
1357 static void unmap_underlying_metadata(struct buffer_head
* bh
)
1359 struct buffer_head
*old_bh
;
1361 old_bh
= get_hash_table(bh
->b_dev
, bh
->b_blocknr
, bh
->b_size
);
1363 unmap_buffer(old_bh
);
1364 /* Here we could run brelse or bforget. We use
1365 bforget because it will try to put the buffer
1372 * block_write_full_page() is SMP-safe - currently it's still
1373 * being called with the kernel lock held, but the code is ready.
1375 static int __block_write_full_page(struct inode
*inode
, struct page
*page
, get_block_t
*get_block
)
1377 int err
, i
, need_balance_dirty
= 0;
1378 unsigned long block
;
1379 struct buffer_head
*bh
, *head
;
1381 if (!PageLocked(page
))
1385 create_empty_buffers(page
, inode
, inode
->i_sb
->s_blocksize
);
1386 head
= page
->buffers
;
1388 block
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
);
1394 * If the buffer isn't up-to-date, we can't be sure
1395 * that the buffer has been initialized with the proper
1396 * block number information etc..
1398 * Leave it to the low-level FS to make all those
1399 * decisions (block #0 may actually be a valid block)
1401 bh
->b_end_io
= end_buffer_io_sync
;
1402 if (!buffer_mapped(bh
)) {
1403 err
= get_block(inode
, block
, bh
, 1);
1407 unmap_underlying_metadata(bh
);
1409 set_bit(BH_Uptodate
, &bh
->b_state
);
1410 if (!atomic_set_buffer_dirty(bh
)) {
1411 __mark_dirty(bh
, 0);
1412 need_balance_dirty
= 1;
1415 bh
= bh
->b_this_page
;
1417 } while (bh
!= head
);
1419 if (need_balance_dirty
)
1420 balance_dirty(bh
->b_dev
);
1422 SetPageUptodate(page
);
1425 ClearPageUptodate(page
);
1429 static int __block_prepare_write(struct inode
*inode
, struct page
*page
,
1430 unsigned from
, unsigned to
, get_block_t
*get_block
)
1432 unsigned block_start
, block_end
;
1433 unsigned long block
;
1435 unsigned blocksize
, bbits
;
1436 struct buffer_head
*bh
, *head
, *wait
[2], **wait_bh
=wait
;
1437 char *kaddr
= (char *)kmap(page
);
1439 blocksize
= inode
->i_sb
->s_blocksize
;
1441 create_empty_buffers(page
, inode
, blocksize
);
1442 head
= page
->buffers
;
1444 bbits
= inode
->i_sb
->s_blocksize_bits
;
1445 block
= page
->index
<< (PAGE_CACHE_SHIFT
- bbits
);
1447 for(bh
= head
, block_start
= 0; bh
!= head
|| !block_start
;
1448 block
++, block_start
=block_end
, bh
= bh
->b_this_page
) {
1451 block_end
= block_start
+blocksize
;
1452 if (block_end
<= from
)
1454 if (block_start
>= to
)
1456 bh
->b_end_io
= end_buffer_io_sync
;
1457 if (!buffer_mapped(bh
)) {
1458 err
= get_block(inode
, block
, bh
, 1);
1461 if (buffer_new(bh
)) {
1462 unmap_underlying_metadata(bh
);
1464 memset(kaddr
+to
, 0, block_end
-to
);
1465 if (block_start
< from
)
1466 memset(kaddr
+block_start
, 0, from
-block_start
);
1470 if (!buffer_uptodate(bh
) &&
1471 (block_start
< from
|| block_end
> to
)) {
1472 ll_rw_block(READ
, 1, &bh
);
1477 * If we issued read requests - let them complete.
1479 while(wait_bh
> wait
) {
1480 wait_on_buffer(*--wait_bh
);
1482 if (!buffer_uptodate(*wait_bh
))
1490 static int __block_commit_write(struct inode
*inode
, struct page
*page
,
1491 unsigned from
, unsigned to
)
1493 unsigned block_start
, block_end
;
1494 int partial
= 0, need_balance_dirty
= 0;
1496 struct buffer_head
*bh
, *head
;
1498 blocksize
= inode
->i_sb
->s_blocksize
;
1500 for(bh
= head
= page
->buffers
, block_start
= 0;
1501 bh
!= head
|| !block_start
;
1502 block_start
=block_end
, bh
= bh
->b_this_page
) {
1503 block_end
= block_start
+ blocksize
;
1504 if (block_end
<= from
|| block_start
>= to
) {
1505 if (!buffer_uptodate(bh
))
1508 set_bit(BH_Uptodate
, &bh
->b_state
);
1509 if (!atomic_set_buffer_dirty(bh
)) {
1510 __mark_dirty(bh
, 0);
1511 need_balance_dirty
= 1;
1516 if (need_balance_dirty
)
1517 balance_dirty(bh
->b_dev
);
1519 	 * If this is a partial write that happened to make all buffers
1520 	 * uptodate then we can optimize away a bogus readpage() for
1521 	 * the next read(). Here we 'discover' whether the page went
1522 	 * uptodate as a result of this (potentially partial) write.
1525 SetPageUptodate(page
);
1530 * Generic "read page" function for block devices that have the normal
1531 * get_block functionality. This is most of the block device filesystems.
1532 * Reads the page asynchronously --- the unlock_buffer() and
1533 * mark_buffer_uptodate() functions propagate buffer state into the
1534 * page struct once IO has completed.
1536 int block_read_full_page(struct page
*page
, get_block_t
*get_block
)
1538 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1539 unsigned long iblock
, lblock
;
1540 struct buffer_head
*bh
, *head
, *arr
[MAX_BUF_PER_PAGE
];
1541 unsigned int blocksize
, blocks
;
1542 unsigned long kaddr
= 0;
1545 if (!PageLocked(page
))
1547 blocksize
= inode
->i_sb
->s_blocksize
;
1549 create_empty_buffers(page
, inode
, blocksize
);
1550 head
= page
->buffers
;
1552 blocks
= PAGE_CACHE_SIZE
>> inode
->i_sb
->s_blocksize_bits
;
1553 iblock
= page
->index
<< (PAGE_CACHE_SHIFT
- inode
->i_sb
->s_blocksize_bits
);
1554 lblock
= (inode
->i_size
+blocksize
-1) >> inode
->i_sb
->s_blocksize_bits
;
1560 if (buffer_uptodate(bh
))
1563 if (!buffer_mapped(bh
)) {
1564 if (iblock
< lblock
)
1565 get_block(inode
, iblock
, bh
, 0);
1566 if (!buffer_mapped(bh
)) {
1569 memset((char *)(kaddr
+ i
*blocksize
), 0, blocksize
);
1570 set_bit(BH_Uptodate
, &bh
->b_state
);
1575 init_buffer(bh
, end_buffer_io_async
, NULL
);
1576 atomic_inc(&bh
->b_count
);
1579 } while (i
++, iblock
++, (bh
= bh
->b_this_page
) != head
);
1582 if (Page_Uptodate(page
))
1584 ll_rw_block(READ
, nr
, arr
);
1587 * all buffers are uptodate - we can set the page
1590 SetPageUptodate(page
);
1599  * For moronic filesystems that do not allow holes in files.
1600 * We may have to extend the file.
1603 int cont_prepare_write(struct page
*page
, unsigned offset
, unsigned to
, get_block_t
*get_block
, unsigned long *bytes
)
1605 struct address_space
*mapping
= page
->mapping
;
1606 struct inode
*inode
= (struct inode
*)mapping
->host
;
1607 struct page
*new_page
;
1608 unsigned long pgpos
;
1611 unsigned blocksize
= inode
->i_sb
->s_blocksize
;
1614 while(page
->index
> (pgpos
= *bytes
>>PAGE_CACHE_SHIFT
)) {
1616 new_page
= grab_cache_page(mapping
, pgpos
);
1619 /* we might sleep */
1620 if (*bytes
>>PAGE_CACHE_SHIFT
!= pgpos
) {
1621 UnlockPage(new_page
);
1622 page_cache_release(new_page
);
1625 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
;
1626 if (zerofrom
& (blocksize
-1)) {
1627 *bytes
|= (blocksize
-1);
1630 status
= __block_prepare_write(inode
, new_page
, zerofrom
,
1631 PAGE_CACHE_SIZE
, get_block
);
1634 kaddr
= (char*)page_address(page
);
1635 memset(kaddr
+zerofrom
, 0, PAGE_CACHE_SIZE
-zerofrom
);
1636 __block_commit_write(inode
, new_page
, zerofrom
, to
);
1638 UnlockPage(new_page
);
1639 page_cache_release(new_page
);
1642 if (page
->index
< pgpos
) {
1643 /* completely inside the area */
1646 /* page covers the boundary, find the boundary offset */
1647 zerofrom
= *bytes
& ~PAGE_CACHE_MASK
;
1649 /* if we will expand the thing last block will be filled */
1650 if (to
> zerofrom
&& (zerofrom
& (blocksize
-1))) {
1651 *bytes
|= (blocksize
-1);
1655 /* starting below the boundary? Nothing to zero out */
1656 if (offset
<= zerofrom
)
1659 status
= __block_prepare_write(inode
, page
, zerofrom
, to
, get_block
);
1662 kaddr
= (char*)page_address(page
);
1663 if (zerofrom
< offset
) {
1664 memset(kaddr
+zerofrom
, 0, offset
-zerofrom
);
1665 __block_commit_write(inode
, page
, zerofrom
, offset
);
1669 ClearPageUptodate(page
);
1674 ClearPageUptodate(new_page
);
1676 UnlockPage(new_page
);
1677 page_cache_release(new_page
);
1682 int block_prepare_write(struct page
*page
, unsigned from
, unsigned to
,
1683 get_block_t
*get_block
)
1685 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1686 int err
= __block_prepare_write(inode
, page
, from
, to
, get_block
);
1688 ClearPageUptodate(page
);
1694 int generic_commit_write(struct file
*file
, struct page
*page
,
1695 unsigned from
, unsigned to
)
1697 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1698 loff_t pos
= ((loff_t
)page
->index
<< PAGE_CACHE_SHIFT
) + to
;
1699 __block_commit_write(inode
,page
,from
,to
);
1701 if (pos
> inode
->i_size
)
1702 inode
->i_size
= pos
;
1706 int block_write_full_page(struct page
*page
, get_block_t
*get_block
)
1708 struct inode
*inode
= (struct inode
*)page
->mapping
->host
;
1709 unsigned long end_index
= inode
->i_size
>> PAGE_CACHE_SHIFT
;
1714 if (page
->index
< end_index
)
1715 return __block_write_full_page(inode
, page
, get_block
);
1717 /* things got complicated... */
1718 offset
= inode
->i_size
& (PAGE_CACHE_SIZE
-1);
1719 /* OK, are we completely out? */
1720 if (page
->index
>= end_index
+1 || !offset
)
1722 /* Sigh... will have to work, then... */
1723 err
= __block_prepare_write(inode
, page
, 0, offset
, get_block
);
1725 memset((char *)page_address(page
)+offset
, 0, PAGE_CACHE_SIZE
-offset
);
1726 __block_commit_write(inode
,page
,0,offset
);
1731 ClearPageUptodate(page
);
1735 int generic_block_bmap(struct address_space
*mapping
, long block
, get_block_t
*get_block
)
1737 struct buffer_head tmp
;
1738 struct inode
*inode
= (struct inode
*)mapping
->host
;
1741 get_block(inode
, block
, &tmp
, 0);
1742 return tmp
.b_blocknr
;
1746 * IO completion routine for a buffer_head being used for kiobuf IO: we
1747 * can't dispatch the kiobuf callback until io_count reaches 0.
1750 static void end_buffer_io_kiobuf(struct buffer_head
*bh
, int uptodate
)
1752 struct kiobuf
*kiobuf
;
1754 mark_buffer_uptodate(bh
, uptodate
);
1756 kiobuf
= bh
->b_kiobuf
;
1758 end_kio_request(kiobuf
, uptodate
);
1763 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1764 * for them to complete. Clean up the buffer_heads afterwards.
1767 static int do_kio(int rw
, int nr
, struct buffer_head
*bh
[], int size
)
1771 struct buffer_head
*tmp
;
1773 struct task_struct
*tsk
= current
;
1774 DECLARE_WAITQUEUE(wait
, tsk
);
1778 ll_rw_block(rw
, nr
, bh
);
1781 spin_lock(&unused_list_lock
);
1783 for (i
= nr
; --i
>= 0; ) {
1786 if (buffer_locked(tmp
)) {
1787 spin_unlock(&unused_list_lock
);
1788 wait_on_buffer(tmp
);
1789 spin_lock(&unused_list_lock
);
1792 if (!buffer_uptodate(tmp
)) {
1793 /* We are traversing bh'es in reverse order so
1794 clearing iosize on error calculates the
1795 amount of IO before the first error. */
1798 __put_unused_buffer_head(tmp
);
1801 spin_unlock(&unused_list_lock
);
1807 * Start I/O on a physical range of kernel memory, defined by a vector
1808 * of kiobuf structs (much like a user-space iovec list).
1810 * The kiobuf must already be locked for IO. IO is submitted
1811 * asynchronously: you need to check page->locked, page->uptodate, and
1812 * maybe wait on page->wait.
1814 * It is up to the caller to make sure that there are enough blocks
1815 * passed in to completely map the iobufs to disk.
1818 int brw_kiovec(int rw
, int nr
, struct kiobuf
*iovec
[],
1819 kdev_t dev
, unsigned long b
[], int size
)
1829 unsigned long blocknr
;
1830 struct kiobuf
* iobuf
= NULL
;
1832 struct buffer_head
*tmp
, *bh
[KIO_MAX_SECTORS
];
1838 * First, do some alignment and validity checks
1840 for (i
= 0; i
< nr
; i
++) {
1842 if ((iobuf
->offset
& (size
-1)) ||
1843 (iobuf
->length
& (size
-1)))
1845 if (!iobuf
->nr_pages
)
1846 panic("brw_kiovec: iobuf not initialised");
1850 * OK to walk down the iovec doing page IO on each page we find.
1852 bufind
= bhind
= transferred
= err
= 0;
1853 for (i
= 0; i
< nr
; i
++) {
1855 offset
= iobuf
->offset
;
1856 length
= iobuf
->length
;
1859 for (pageind
= 0; pageind
< iobuf
->nr_pages
; pageind
++) {
1860 map
= iobuf
->maplist
[pageind
];
1866 while (length
> 0) {
1867 blocknr
= b
[bufind
++];
1868 tmp
= get_unused_buffer_head(0);
1874 tmp
->b_dev
= B_FREE
;
1876 set_bh_page(tmp
, map
, offset
);
1877 tmp
->b_this_page
= tmp
;
1879 init_buffer(tmp
, end_buffer_io_kiobuf
, NULL
);
1881 tmp
->b_blocknr
= blocknr
;
1882 tmp
->b_state
= 1 << BH_Mapped
;
1883 tmp
->b_kiobuf
= iobuf
;
1886 set_bit(BH_Uptodate
, &tmp
->b_state
);
1887 set_bit(BH_Dirty
, &tmp
->b_state
);
1894 atomic_inc(&iobuf
->io_count
);
1897 * Start the IO if we have got too much
1899 if (bhind
>= KIO_MAX_SECTORS
) {
1900 err
= do_kio(rw
, bhind
, bh
, size
);
1908 if (offset
>= PAGE_SIZE
) {
1912 } /* End of block loop */
1913 } /* End of page loop */
1914 } /* End of iovec loop */
1916 /* Is there any IO still left to submit? */
1918 err
= do_kio(rw
, bhind
, bh
, size
);
1931 /* We got an error allocating the bh'es. Just free the current
1932 buffer_heads and exit. */
1933 spin_lock(&unused_list_lock
);
1934 for (i
= bhind
; --i
>= 0; ) {
1935 __put_unused_buffer_head(bh
[bhind
]);
1937 spin_unlock(&unused_list_lock
);
1942 * Start I/O on a page.
1943 * This function expects the page to be locked and may return
1944 * before I/O is complete. You then have to check page->locked,
1945 * page->uptodate, and maybe wait on page->wait.
1947 * brw_page() is SMP-safe, although it's being called with the
1948 * kernel lock held - but the code is ready.
1950 * FIXME: we need a swapper_inode->get_block function to remove
1951 * some of the bmap kludges and interface ugliness here.
1953 int brw_page(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
)
1955 struct buffer_head
*head
, *bh
, *arr
[MAX_BUF_PER_PAGE
];
1956 int nr
, fresh
/* temporary debugging flag */, block
;
1958 if (!PageLocked(page
))
1959 panic("brw_page: page not locked for I/O");
1960 // ClearPageError(page);
1962 * We pretty much rely on the page lock for this, because
1963 * create_page_buffers() might sleep.
1966 if (!page
->buffers
) {
1967 create_page_buffers(rw
, page
, dev
, b
, size
);
1973 head
= page
->buffers
;
1979 if (fresh
&& (atomic_read(&bh
->b_count
) != 0))
1984 if (!buffer_uptodate(bh
)) {
1986 atomic_inc(&bh
->b_count
);
1988 } else { /* WRITE */
1989 if (!bh
->b_blocknr
) {
1992 bh
->b_blocknr
= block
;
1997 set_bit(BH_Uptodate
, &bh
->b_state
);
1998 set_bit(BH_Dirty
, &bh
->b_state
);
2000 atomic_inc(&bh
->b_count
);
2002 bh
= bh
->b_this_page
;
2003 } while (bh
!= head
);
2004 if ((rw
== READ
) && nr
) {
2005 if (Page_Uptodate(page
))
2007 ll_rw_block(rw
, nr
, arr
);
2009 if (!nr
&& rw
== READ
) {
2010 SetPageUptodate(page
);
2013 if (nr
&& (rw
== WRITE
))
2014 ll_rw_block(rw
, nr
, arr
);
2019 int block_symlink(struct inode
*inode
, const char *symname
, int len
)
2021 struct address_space
*mapping
= inode
->i_mapping
;
2022 struct page
*page
= grab_cache_page(mapping
, 0);
2028 err
= mapping
->a_ops
->prepare_write(NULL
, page
, 0, len
-1);
2031 kaddr
= (char*)page_address(page
);
2032 memcpy(kaddr
, symname
, len
-1);
2033 mapping
->a_ops
->commit_write(NULL
, page
, 0, len
-1);
2035 * Notice that we are _not_ going to block here - end of page is
2036 * unmapped, so this will only try to map the rest of page, see
2037 * that it is unmapped (typically even will not look into inode -
2038 * ->i_size will be enough for everything) and zero it out.
2039 * OTOH it's obviously correct and should make the page up-to-date.
2041 err
= mapping
->a_ops
->readpage(NULL
, page
);
2043 page_cache_release(page
);
2046 mark_inode_dirty(inode
);
2050 page_cache_release(page
);
2056 * Try to increase the number of buffers available: the size argument
2057 * is used to determine what kind of buffers we want.
2059 static int grow_buffers(int size
)
2062 struct buffer_head
*bh
, *tmp
;
2063 struct buffer_head
* insert_point
;
2066 if ((size
& 511) || (size
> PAGE_SIZE
)) {
2067 printk("VFS: grow_buffers: size = %d\n",size
);
2071 page
= alloc_page(GFP_BUFFER
);
2074 bh
= create_buffers(page
, size
, 0);
2076 goto no_buffer_head
;
2078 isize
= BUFSIZE_INDEX(size
);
2080 spin_lock(&free_list
[isize
].lock
);
2081 insert_point
= free_list
[isize
].list
;
2085 tmp
->b_next_free
= insert_point
->b_next_free
;
2086 tmp
->b_prev_free
= insert_point
;
2087 insert_point
->b_next_free
->b_prev_free
= tmp
;
2088 insert_point
->b_next_free
= tmp
;
2090 tmp
->b_prev_free
= tmp
;
2091 tmp
->b_next_free
= tmp
;
2094 if (tmp
->b_this_page
)
2095 tmp
= tmp
->b_this_page
;
2099 tmp
->b_this_page
= bh
;
2100 free_list
[isize
].list
= bh
;
2101 spin_unlock(&free_list
[isize
].lock
);
2104 lru_cache_add(page
);
2105 atomic_inc(&buffermem_pages
);
2109 page_cache_release(page
);
2115 * Sync all the buffers on one page..
2117 * If we have old buffers that are locked, we'll
2118 * wait on them, but we won't wait on the new ones
2119 * we're writing out now.
2121 * This all is required so that we can free up memory
2124 static void sync_page_buffers(struct buffer_head
*bh
)
2126 struct buffer_head
* tmp
;
2130 struct buffer_head
*p
= tmp
;
2131 tmp
= tmp
->b_this_page
;
2132 if (buffer_dirty(p
) && !buffer_locked(p
))
2133 ll_rw_block(WRITE
, 1, &p
);
2134 } while (tmp
!= bh
);
2138 * Can the buffer be thrown out?
2140 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2141 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
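/*
 * Note (added by the editor, not in the original source): buffer_busy()
 * deliberately uses a bitwise OR - the result is non-zero (i.e. "busy")
 * if either the reference count is non-zero or any of the
 * BUFFER_BUSY_BITS state bits is set, so no short-circuit is needed.
 */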
2144 * try_to_free_buffers() checks if all the buffers on this particular page
2145  * are unused, and frees the page if so.
2147 * Wake up bdflush() if this fails - if we're running low on memory due
2148 * to dirty buffers, we need to flush them out as quickly as possible.
2150 * NOTE: There are quite a number of ways that threads of control can
2151 * obtain a reference to a buffer head within a page. So we must
2152 * lock out all of these paths to cleanly toss the page.
2154 int try_to_free_buffers(struct page
* page
)
2156 struct buffer_head
* tmp
, * bh
= page
->buffers
;
2157 int index
= BUFSIZE_INDEX(bh
->b_size
);
2159 spin_lock(&lru_list_lock
);
2160 write_lock(&hash_table_lock
);
2161 spin_lock(&free_list
[index
].lock
);
2164 struct buffer_head
*p
= tmp
;
2166 tmp
= tmp
->b_this_page
;
2168 goto busy_buffer_page
;
2169 } while (tmp
!= bh
);
2171 spin_lock(&unused_list_lock
);
2174 struct buffer_head
* p
= tmp
;
2175 tmp
= tmp
->b_this_page
;
2177 /* The buffer can be either on the regular
2178 * queues or on the free list..
2180 if (p
->b_dev
!= B_FREE
)
2181 __remove_from_queues(p
);
2183 __remove_from_free_list(p
, index
);
2184 __put_unused_buffer_head(p
);
2185 } while (tmp
!= bh
);
2186 spin_unlock(&unused_list_lock
);
2188 /* Wake up anyone waiting for buffer heads */
2189 wake_up(&buffer_wait
);
2191 /* And free the page */
2192 page
->buffers
= NULL
;
2193 page_cache_release(page
);
2194 spin_unlock(&free_list
[index
].lock
);
2195 write_unlock(&hash_table_lock
);
2196 spin_unlock(&lru_list_lock
);
2200 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2201 spin_unlock(&free_list
[index
].lock
);
2202 write_unlock(&hash_table_lock
);
2203 spin_unlock(&lru_list_lock
);
2204 sync_page_buffers(bh
);
2208 /* ================== Debugging =================== */
2210 void show_buffers(void)
2213 struct buffer_head
* bh
;
2214 int found
= 0, locked
= 0, dirty
= 0, used
= 0, lastused
= 0;
2217 static char *buf_types
[NR_LIST
] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2220 printk("Buffer memory: %6dkB\n",
2221 atomic_read(&buffermem_pages
) << (PAGE_SHIFT
-10));
2223 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2224 if (!spin_trylock(&lru_list_lock
))
2226 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
2227 found
= locked
= dirty
= used
= lastused
= protected = 0;
2228 bh
= lru_list
[nlist
];
2233 if (buffer_locked(bh
))
2235 if (buffer_protected(bh
))
2237 if (buffer_dirty(bh
))
2239 if (atomic_read(&bh
->b_count
))
2240 used
++, lastused
= found
;
2241 bh
= bh
->b_next_free
;
2242 } while (bh
!= lru_list
[nlist
]);
2244 int tmp
= nr_buffers_type
[nlist
];
2246 printk("%9s: BUG -> found %d, reported %d\n",
2247 buf_types
[nlist
], found
, tmp
);
2249 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2250 "%d locked, %d protected, %d dirty\n",
2251 buf_types
[nlist
], found
, size_buffers_type
[nlist
]>>10,
2252 used
, lastused
, locked
, protected, dirty
);
2254 spin_unlock(&lru_list_lock
);
2258 /* ===================== Init ======================= */
2261 * allocate the hash table and init the free list
2262 * Use gfp() for the hash table to decrease TLB misses, use
2263 * SLAB cache for buffer heads.
2265 void __init
buffer_init(unsigned long mempages
)
2268 unsigned int nr_hash
;
2270 	/* The buffer cache hash table is less important these days,
2271 	 * trim it a bit.
2272 	 */
2275 mempages
*= sizeof(struct buffer_head
*);
2277 for (order
= 0; (1 << order
) < mempages
; order
++)
2280 /* try to allocate something until we get it or we're asking
2281 for something that is really too small */
2286 nr_hash
= (PAGE_SIZE
<< order
) / sizeof(struct buffer_head
*);
2287 bh_hash_mask
= (nr_hash
- 1);
2291 while((tmp
>>= 1UL) != 0UL)
2294 hash_table
= (struct buffer_head
**)
2295 __get_free_pages(GFP_ATOMIC
, order
);
2296 } while (hash_table
== NULL
&& --order
> 0);
2297 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2298 nr_hash
, order
, (PAGE_SIZE
<< order
));
2301 panic("Failed to allocate buffer hash table\n");
2303 /* Setup hash chains. */
2304 for(i
= 0; i
< nr_hash
; i
++)
2305 hash_table
[i
] = NULL
;
2307 /* Setup free lists. */
2308 for(i
= 0; i
< NR_SIZES
; i
++) {
2309 free_list
[i
].list
= NULL
;
2310 free_list
[i
].lock
= SPIN_LOCK_UNLOCKED
;
2313 /* Setup lru lists. */
2314 for(i
= 0; i
< NR_LIST
; i
++)
2317 bh_cachep
= kmem_cache_create("buffer_head",
2318 sizeof(struct buffer_head
),
2320 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
2322 panic("Cannot create buffer head SLAB cache\n");
2326 /* ====================== bdflush support =================== */
2328 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2329 * response to dirty buffers. Once this process is activated, we write back
2330 * a limited number of buffers to the disks and then go back to sleep again.
2332 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done
);
2333 struct task_struct
*bdflush_tsk
= 0;
2335 void wakeup_bdflush(int block
)
2337 DECLARE_WAITQUEUE(wait
, current
);
2339 if (current
== bdflush_tsk
)
2343 wake_up_process(bdflush_tsk
);
2347 	/* kflushd can wake us up before we have a chance to
2348 go to sleep so we must be smart in handling
2349 this wakeup event from kflushd to avoid deadlocking in SMP
2350 (we are not holding any lock anymore in these two paths). */
2351 __set_current_state(TASK_UNINTERRUPTIBLE
);
2352 add_wait_queue(&bdflush_done
, &wait
);
2354 wake_up_process(bdflush_tsk
);
2357 remove_wait_queue(&bdflush_done
, &wait
);
2358 __set_current_state(TASK_RUNNING
);
2361 /* This is the _only_ function that deals with flushing async writes
2363 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2364    as all dirty buffers live _only_ in the DIRTY lru list.
2365    As we never browse the LOCKED and CLEAN lru lists they are in fact
2366    completely useless. */
2367 static int flush_dirty_buffers(int check_flushtime
)
2369 struct buffer_head
* bh
, *next
;
2373 spin_lock(&lru_list_lock
);
2374 bh
= lru_list
[BUF_DIRTY
];
2377 for (i
= nr_buffers_type
[BUF_DIRTY
]; i
-- > 0; bh
= next
) {
2378 next
= bh
->b_next_free
;
2380 if (!buffer_dirty(bh
)) {
2381 __refile_buffer(bh
);
2384 if (buffer_locked(bh
))
2387 if (check_flushtime
) {
2388 /* The dirty lru list is chronologically ordered so
2389 if the current bh is not yet timed out,
2390 then also all the following bhs
2391 will be too young. */
2392 if (time_before(jiffies
, bh
->b_flushtime
))
2395 if (++flushed
> bdf_prm
.b_un
.ndirty
)
2399 /* OK, now we are committed to write it out. */
2400 atomic_inc(&bh
->b_count
);
2401 spin_unlock(&lru_list_lock
);
2402 ll_rw_block(WRITE
, 1, &bh
);
2403 atomic_dec(&bh
->b_count
);
2405 if (current
->need_resched
)
2410 spin_unlock(&lru_list_lock
);
2416 * Here we attempt to write back old buffers. We also try to flush inodes
2417 * and supers as well, since this function is essentially "update", and
2418 * otherwise there would be no way of ensuring that these quantities ever
2419 * get written back. Ideally, we would have a timestamp on the inodes
2420 * and superblocks so that we could write back only the old ones as well
2423 static int sync_old_buffers(void)
2430 flush_dirty_buffers(1);
2431 /* must really sync all the active I/O request to disk here */
2432 run_task_queue(&tq_disk
);
2436 int block_sync_page(struct page
*page
)
2438 run_task_queue(&tq_disk
);
2442 /* This is the interface to bdflush. As we get more sophisticated, we can
2443 * pass tuning parameters to this "process", to adjust how it behaves.
2444 * We would want to verify each parameter, however, to make sure that it
2447 asmlinkage
long sys_bdflush(int func
, long data
)
2449 if (!capable(CAP_SYS_ADMIN
))
2453 /* do_exit directly and let kupdate to do its work alone. */
2455 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2456 a syscall that doesn't care about the current mm context. */
2458 struct mm_struct
*user_mm
;
2461 	 * bdflush will spend all of its time in kernel-space,
2462 * without touching user-space, so we can switch it into
2463 * 'lazy TLB mode' to reduce the cost of context-switches
2464 * to and from bdflush.
2466 user_mm
= start_lazy_tlb();
2467 error
= sync_old_buffers();
2468 end_lazy_tlb(user_mm
);
2473 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2475 int i
= (func
-2) >> 1;
2476 if (i
>= 0 && i
< N_PARAM
) {
2477 if ((func
& 1) == 0)
2478 return put_user(bdf_prm
.data
[i
], (int*)data
);
2480 if (data
>= bdflush_min
[i
] && data
<= bdflush_max
[i
]) {
2481 bdf_prm
.data
[i
] = data
;
2488 /* Having func 0 used to launch the actual bdflush and then never
2489 * return (unless explicitly killed). We return zero here to
2490 * remain semi-compatible with present update(8) programs.
2496 * This is the actual bdflush daemon itself. It used to be started from
2497 * the syscall above, but now we launch it ourselves internally with
2498 * kernel_thread(...) directly after the first thread in init/main.c
2500 int bdflush(void * unused
)
2502 struct task_struct
*tsk
= current
;
2505 * We have a bare-bones task_struct, and really should fill
2506 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2507 * display semi-sane things. Not real crucial though...
2512 strcpy(tsk
->comm
, "kflushd");
2515 /* avoid getting signals */
2516 spin_lock_irq(&tsk
->sigmask_lock
);
2518 sigfillset(&tsk
->blocked
);
2519 recalc_sigpending(tsk
);
2520 spin_unlock_irq(&tsk
->sigmask_lock
);
2523 CHECK_EMERGENCY_SYNC
2525 flushed
= flush_dirty_buffers(0);
2527 			/* If wakeup_bdflush wakes us up
2528 			   after our bdflush_done wakeup, then
2529 			   we must make sure not to sleep
2530 			   in schedule_timeout, otherwise
2531 			   wakeup_bdflush may wait for our
2532 			   bdflush_done wakeup that would never arrive
2533 			   (as we would be sleeping) and so it would
2534 			   deadlock in SMP. */
2535 __set_current_state(TASK_INTERRUPTIBLE
);
2536 wake_up(&bdflush_done
);
2538 * If there are still a lot of dirty buffers around,
2539 * skip the sleep and flush some more. Otherwise, we
2540 * go to sleep waiting a wakeup.
2542 if (!flushed
|| balance_dirty_state(NODEV
) < 0)
2544 /* Remember to mark us as running otherwise
2545 the next schedule will block. */
2546 __set_current_state(TASK_RUNNING
);
2551  * This is the kernel update daemon. It used to live in userspace
2552  * but since it needs to run safely we want it to be unkillable by mistake.
2553  * You don't need to change your userspace configuration since
2554  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2556 int kupdate(void * unused
)
2558 struct task_struct
* tsk
= current
;
2563 strcpy(tsk
->comm
, "kupdate");
2565 /* sigstop and sigcont will stop and wakeup kupdate */
2566 spin_lock_irq(&tsk
->sigmask_lock
);
2567 sigfillset(&tsk
->blocked
);
2568 	siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2569 recalc_sigpending(tsk
);
2570 spin_unlock_irq(&tsk
->sigmask_lock
);
2573 /* update interval */
2574 interval
= bdf_prm
.b_un
.interval
;
2576 tsk
->state
= TASK_INTERRUPTIBLE
;
2577 schedule_timeout(interval
);
2580 tsk
->state
= TASK_STOPPED
;
2581 schedule(); /* wait for SIGCONT */
2583 /* check for sigstop */
2584 if (signal_pending(tsk
)) {
2586 spin_lock_irq(&tsk
->sigmask_lock
);
2587 if (sigismember(&tsk
->signal
, SIGSTOP
)) {
2588 sigdelset(&tsk
->signal
, SIGSTOP
);
2591 recalc_sigpending(tsk
);
2592 spin_unlock_irq(&tsk
->sigmask_lock
);
2597 printk("kupdate() activated...\n");
2603 static int __init
bdflush_init(void)
2605 kernel_thread(bdflush
, NULL
, CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
);
2606 kernel_thread(kupdate
, NULL
, CLONE_FS
| CLONE_FILES
| CLONE_SIGHAND
);
2610 module_init(bdflush_init
)