1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required by older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
53 #define NR_SIZES 7
54 static char buffersize_index[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
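/* BUFSIZE_INDEX maps a block size onto its free-list slot: 512 -> 0, 1024 -> 1,
 * 2048 -> 2, 4096 -> 3, 8192 -> 4, 16384 -> 5, 32768 -> 6. The -1 entries in
 * buffersize_index[] mark sizes that are not a supported power of two. */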
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
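 * ("a > b" meaning a must be taken before, i.e. outside of, b; see
 * try_to_free_buffers(), which nests all four locks in exactly this order)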
72 * Hash table gook..
74 static unsigned int bh_hash_mask;
75 static unsigned int bh_hash_shift;
76 static struct buffer_head **hash_table;
77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
79 static struct buffer_head *lru_list[NR_LIST];
80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
81 static int nr_buffers_type[NR_LIST];
82 static unsigned long size_buffers_type[NR_LIST];
84 static struct buffer_head * unused_list;
85 static int nr_unused_buffer_heads;
86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
89 struct bh_free_head {
90 struct buffer_head *list;
91 spinlock_t lock;
93 static struct bh_free_head free_list[NR_SIZES];
95 kmem_cache_t *bh_cachep;
97 static int grow_buffers(int size);
98 static void __refile_buffer(struct buffer_head *);
100 /* This is used by some architectures to estimate available memory. */
101 atomic_t buffermem_pages = ATOMIC_INIT(0);
103 /* Here is the parameter block for the bdflush process. If you add or
104 * remove any of the parameters, make sure to update kernel/sysctl.c.
107 #define N_PARAM 9
109 /* The dummy values in this structure are left in there for compatibility
110 * with old programs that play with the /proc entries.
112 union bdflush_param {
113 struct {
114 int nfract; /* Percentage of buffer cache dirty to
115 activate bdflush */
116 int ndirty; /* Maximum number of dirty blocks to write out per
117 wake-cycle */
118 int nrefill; /* Number of clean buffers to try to obtain
119 each time we call refill */
120 int nref_dirt; /* Dirty buffer threshold for activating bdflush
121 when trying to refill buffers. */
122 int interval; /* jiffies delay between kupdate flushes */
123 int age_buffer; /* Time for normal buffer to age before we flush it */
124 int age_super; /* Time for superblock to age before we flush it */
125 int dummy2; /* unused */
126 int dummy3; /* unused */
127 } b_un;
128 unsigned int data[N_PARAM];
129 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
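/* In field order: nfract=40 (%), ndirty=500, nrefill=64, nref_dirt=256,
 * interval=5*HZ, age_buffer=30*HZ, age_super=5*HZ, plus the two dummy
 * slots kept only for /proc compatibility. */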
131 /* These are the min and max parameter values that we will allow to be assigned */
132 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
133 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
136 * Rewrote the wait-routines to use the "new" wait-queue functionality
137 * and got rid of the cli-sti pairs. The wait-queue routines still
138 * need cli-sti, but now it's just a couple of 386 instructions or so.
140 * Note that the real wait_on_buffer() is an inline function that checks
141 * if 'b_wait' is set before calling this, so that the queues aren't set
142 * up unnecessarily.
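 *
 * The loop below is the classic open-coded sleep: pin the buffer with an
 * extra reference, kick the disk queue, mark ourselves TASK_UNINTERRUPTIBLE
 * and re-test buffer_locked() before calling schedule(), so that a wake-up
 * arriving between the test and schedule() cannot be lost.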
144 void __wait_on_buffer(struct buffer_head * bh)
146 struct task_struct *tsk = current;
147 DECLARE_WAITQUEUE(wait, tsk);
149 atomic_inc(&bh->b_count);
150 add_wait_queue(&bh->b_wait, &wait);
151 do {
152 run_task_queue(&tq_disk);
153 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
154 if (!buffer_locked(bh))
155 break;
156 schedule();
157 } while (buffer_locked(bh));
158 tsk->state = TASK_RUNNING;
159 remove_wait_queue(&bh->b_wait, &wait);
160 atomic_dec(&bh->b_count);
163 /* Call sync_buffers with wait!=0 to ensure that the call does not
164 * return until all buffer writes have completed. Sync() may return
165 * before the writes have finished; fsync() may not.
168 /* Godamity-damn. Some buffers (bitmaps for filesystems)
169 * spontaneously dirty themselves without brelse ever being called.
170 * We will ultimately want to put these in a separate list, but for
171 * now we search all of the lists for dirty buffers.
173 static int sync_buffers(kdev_t dev, int wait)
175 int i, retry, pass = 0, err = 0;
176 struct buffer_head * bh, *next;
178 /* One pass for no-wait, three for wait:
179 * 0) write out all dirty, unlocked buffers;
180 * 1) write out all dirty buffers, waiting if locked;
181 * 2) wait for completion by waiting for all buffers to unlock.
183 do {
184 retry = 0;
186 /* We search all lists as a failsafe mechanism, not because we expect
187 * there to be dirty buffers on any of the other lists.
189 repeat:
190 spin_lock(&lru_list_lock);
191 bh = lru_list[BUF_DIRTY];
192 if (!bh)
193 goto repeat2;
195 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
196 next = bh->b_next_free;
198 if (!lru_list[BUF_DIRTY])
199 break;
200 if (dev && bh->b_dev != dev)
201 continue;
202 if (buffer_locked(bh)) {
203 /* Buffer is locked; skip it unless wait is
204 * requested AND pass > 0.
206 if (!wait || !pass) {
207 retry = 1;
208 continue;
210 atomic_inc(&bh->b_count);
211 spin_unlock(&lru_list_lock);
212 wait_on_buffer (bh);
213 atomic_dec(&bh->b_count);
214 goto repeat;
217 /* If an unlocked buffer is not uptodate, there has
218 * been an IO error. Skip it.
220 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
221 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
222 err = -EIO;
223 continue;
226 /* Don't write clean buffers. Don't write ANY buffers
227 * on the third pass.
229 if (!buffer_dirty(bh) || pass >= 2)
230 continue;
232 atomic_inc(&bh->b_count);
233 spin_unlock(&lru_list_lock);
234 ll_rw_block(WRITE, 1, &bh);
235 atomic_dec(&bh->b_count);
236 retry = 1;
237 goto repeat;
240 repeat2:
241 bh = lru_list[BUF_LOCKED];
242 if (!bh) {
243 spin_unlock(&lru_list_lock);
244 break;
246 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
247 next = bh->b_next_free;
249 if (!lru_list[BUF_LOCKED])
250 break;
251 if (dev && bh->b_dev != dev)
252 continue;
253 if (buffer_locked(bh)) {
254 /* Buffer is locked; skip it unless wait is
255 * requested AND pass > 0.
257 if (!wait || !pass) {
258 retry = 1;
259 continue;
261 atomic_inc(&bh->b_count);
262 spin_unlock(&lru_list_lock);
263 wait_on_buffer (bh);
264 spin_lock(&lru_list_lock);
265 atomic_dec(&bh->b_count);
266 goto repeat2;
269 spin_unlock(&lru_list_lock);
271 /* If we are waiting for the sync to succeed, and if any dirty
272 * blocks were written, then repeat; on the second pass, only
273 * wait for buffers being written (do not submit any
274 * more buffers for writing on the second pass).
276 } while (wait && retry && ++pass<=2);
277 return err;
280 void sync_dev(kdev_t dev)
282 sync_supers(dev);
283 sync_inodes(dev);
284 DQUOT_SYNC(dev);
285 /* sync all the dirty buffers out to disk only _after_ all the
286 high level layers have finished generating dirty buffer data
287 (or we'd return with some buffers still dirty on the block device,
288 breaking the semantics of this call) */
289 sync_buffers(dev, 0);
291 * FIXME(eric) we need to sync the physical devices here.
292 * This is because some (scsi) controllers have huge amounts of
293 * cache onboard (hundreds of Mb), and we need to instruct
294 * them to commit all of the dirty memory to disk, and we should
295 * not return until this has happened.
297 * This would need to get implemented by going through the assorted
298 * layers so that each block major number can be synced, and this
299 * would call down into the upper and mid-layer scsi.
303 int fsync_dev(kdev_t dev)
305 sync_buffers(dev, 0);
307 lock_kernel();
308 sync_supers(dev);
309 sync_inodes(dev);
310 DQUOT_SYNC(dev);
311 unlock_kernel();
313 return sync_buffers(dev, 1);
316 asmlinkage long sys_sync(void)
318 fsync_dev(0);
319 return 0;
323 * filp may be NULL if called via the msync of a vma.
326 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
328 struct inode * inode = dentry->d_inode;
329 struct super_block * sb;
330 kdev_t dev;
331 int ret;
333 lock_kernel();
334 /* sync the inode to buffers */
335 write_inode_now(inode, 0);
337 /* sync the superblock to buffers */
338 sb = inode->i_sb;
339 wait_on_super(sb);
340 if (sb->s_op && sb->s_op->write_super)
341 sb->s_op->write_super(sb);
343 /* .. finally sync the buffers to disk */
344 dev = inode->i_dev;
345 ret = sync_buffers(dev, 1);
346 unlock_kernel();
347 return ret;
350 asmlinkage long sys_fsync(unsigned int fd)
352 struct file * file;
353 struct dentry * dentry;
354 struct inode * inode;
355 int err;
357 err = -EBADF;
358 file = fget(fd);
359 if (!file)
360 goto out;
362 dentry = file->f_dentry;
363 inode = dentry->d_inode;
365 err = -EINVAL;
366 if (!file->f_op || !file->f_op->fsync)
367 goto out_putf;
369 /* We need to protect against concurrent writers.. */
370 down(&inode->i_sem);
371 err = file->f_op->fsync(file, dentry, 0);
372 up(&inode->i_sem);
374 out_putf:
375 fput(file);
376 out:
377 return err;
380 asmlinkage long sys_fdatasync(unsigned int fd)
382 struct file * file;
383 struct dentry * dentry;
384 struct inode * inode;
385 int err;
387 err = -EBADF;
388 file = fget(fd);
389 if (!file)
390 goto out;
392 dentry = file->f_dentry;
393 inode = dentry->d_inode;
395 err = -EINVAL;
396 if (!file->f_op || !file->f_op->fsync)
397 goto out_putf;
399 down(&inode->i_sem);
400 err = file->f_op->fsync(file, dentry, 1);
401 up(&inode->i_sem);
403 out_putf:
404 fput(file);
405 out:
406 return err;
409 /* After several hours of tedious analysis, the following hash
410 * function won. Do not mess with it... -DaveM
412 #define _hashfn(dev,block) \
413 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
414 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
415 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
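/* bh_hash_shift is log2 of the table size (computed in buffer_init()), so the
 * shifts in _hashfn() scatter both the dev and block bits across the whole
 * index range before the result is masked with bh_hash_mask. */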
417 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
419 if ((bh->b_next = *head) != NULL)
420 bh->b_next->b_pprev = &bh->b_next;
421 *head = bh;
422 bh->b_pprev = head;
425 static __inline__ void __hash_unlink(struct buffer_head *bh)
427 if (bh->b_pprev) {
428 if (bh->b_next)
429 bh->b_next->b_pprev = bh->b_pprev;
430 *(bh->b_pprev) = bh->b_next;
431 bh->b_pprev = NULL;
435 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
437 struct buffer_head **bhp = &lru_list[blist];
439 if(!*bhp) {
440 *bhp = bh;
441 bh->b_prev_free = bh;
443 bh->b_next_free = *bhp;
444 bh->b_prev_free = (*bhp)->b_prev_free;
445 (*bhp)->b_prev_free->b_next_free = bh;
446 (*bhp)->b_prev_free = bh;
447 nr_buffers_type[blist]++;
448 size_buffers_type[blist] += bh->b_size;
451 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
453 if (bh->b_prev_free || bh->b_next_free) {
454 bh->b_prev_free->b_next_free = bh->b_next_free;
455 bh->b_next_free->b_prev_free = bh->b_prev_free;
456 if (lru_list[blist] == bh)
457 lru_list[blist] = bh->b_next_free;
458 if (lru_list[blist] == bh)
459 lru_list[blist] = NULL;
460 bh->b_next_free = bh->b_prev_free = NULL;
461 nr_buffers_type[blist]--;
462 size_buffers_type[blist] -= bh->b_size;
466 static void __remove_from_free_list(struct buffer_head * bh, int index)
468 if(bh->b_next_free == bh)
469 free_list[index].list = NULL;
470 else {
471 bh->b_prev_free->b_next_free = bh->b_next_free;
472 bh->b_next_free->b_prev_free = bh->b_prev_free;
473 if (free_list[index].list == bh)
474 free_list[index].list = bh->b_next_free;
476 bh->b_next_free = bh->b_prev_free = NULL;
479 /* must be called with both the hash_table_lock and the lru_list_lock
480 held */
481 static void __remove_from_queues(struct buffer_head *bh)
483 __hash_unlink(bh);
484 __remove_from_lru_list(bh, bh->b_list);
487 static void insert_into_queues(struct buffer_head *bh)
489 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
491 spin_lock(&lru_list_lock);
492 write_lock(&hash_table_lock);
493 __hash_link(bh, head);
494 __insert_into_lru_list(bh, bh->b_list);
495 write_unlock(&hash_table_lock);
496 spin_unlock(&lru_list_lock);
499 /* This function must only run if there are no other
500 * references _anywhere_ to this buffer head.
502 static void put_last_free(struct buffer_head * bh)
504 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
505 struct buffer_head **bhp = &head->list;
507 bh->b_state = 0;
509 spin_lock(&head->lock);
510 bh->b_dev = B_FREE;
511 if(!*bhp) {
512 *bhp = bh;
513 bh->b_prev_free = bh;
515 bh->b_next_free = *bhp;
516 bh->b_prev_free = (*bhp)->b_prev_free;
517 (*bhp)->b_prev_free->b_next_free = bh;
518 (*bhp)->b_prev_free = bh;
519 spin_unlock(&head->lock);
523 * Why like this, I hear you say... The reason is race-conditions.
524 * As we don't lock buffers (unless we are reading them, that is),
525 * something might happen to them while we sleep (ie a read error
526 * will mark them bad). This shouldn't really happen currently, but
527 * the code is ready.
529 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
531 struct buffer_head **head = &hash(dev, block);
532 struct buffer_head *bh;
534 read_lock(&hash_table_lock);
535 for(bh = *head; bh; bh = bh->b_next)
536 if (bh->b_blocknr == block &&
537 bh->b_size == size &&
538 bh->b_dev == dev)
539 break;
540 if (bh)
541 atomic_inc(&bh->b_count);
542 read_unlock(&hash_table_lock);
544 return bh;
547 unsigned int get_hardblocksize(kdev_t dev)
550 * Get the hard sector size for the given device. If we don't know
551 * what it is, return 0.
553 if (hardsect_size[MAJOR(dev)] != NULL) {
554 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
555 if (blksize != 0)
556 return blksize;
560 * We don't know what the hardware sector size for this device is.
561 * Return 0 indicating that we don't know.
563 return 0;
566 /* If invalidate_buffers() trashes dirty buffers, it means some kind
567 of fs corruption is going on. Trashing dirty data always implies losing
568 information that was supposed to have been stored on the physical layer
569 by the user.
571 Thus invalidate_buffers in general usage is not allowed to trash dirty
572 buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
574 NOTE: if the user removes a removable-media disk while there is still
575 dirty data not synced to disk (due to a bug in the device driver
576 or to an error of the user), then by not destroying the dirty buffers we
577 could also corrupt the next media inserted, so a parameter is
578 necessary to handle this case as safely as possible (trying
579 not to corrupt the newly inserted disk with data belonging to
580 the old, now corrupted, disk). Also, for the ramdisk the natural way
581 to release the ramdisk memory is to destroy its dirty buffers.
583 These are two special cases. Normal usage is for the device driver
584 to issue a sync on the device (without waiting for I/O completion) and
585 then an invalidate_buffers call that doesn't trash dirty buffers. */
586 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
588 int i, nlist, slept;
589 struct buffer_head * bh, * bh_next;
591 retry:
592 slept = 0;
593 spin_lock(&lru_list_lock);
594 for(nlist = 0; nlist < NR_LIST; nlist++) {
595 bh = lru_list[nlist];
596 if (!bh)
597 continue;
598 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
599 bh_next = bh->b_next_free;
600 if (bh->b_dev != dev)
601 continue;
602 if (buffer_locked(bh)) {
603 atomic_inc(&bh->b_count);
604 spin_unlock(&lru_list_lock);
605 wait_on_buffer(bh);
606 slept = 1;
607 spin_lock(&lru_list_lock);
608 atomic_dec(&bh->b_count);
611 write_lock(&hash_table_lock);
612 if (!atomic_read(&bh->b_count) &&
613 (destroy_dirty_buffers || !buffer_dirty(bh))) {
614 __remove_from_queues(bh);
615 put_last_free(bh);
617 write_unlock(&hash_table_lock);
618 if (slept)
619 goto out;
622 out:
623 spin_unlock(&lru_list_lock);
624 if (slept)
625 goto retry;
628 void set_blocksize(kdev_t dev, int size)
630 extern int *blksize_size[];
631 int i, nlist, slept;
632 struct buffer_head * bh, * bh_next;
634 if (!blksize_size[MAJOR(dev)])
635 return;
637 /* Size must be a power of two, and between 512 and PAGE_SIZE */
638 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
639 panic("Invalid blocksize passed to set_blocksize");
641 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
642 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
643 return;
645 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
646 return;
647 sync_buffers(dev, 2);
648 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
650 retry:
651 slept = 0;
652 spin_lock(&lru_list_lock);
653 for(nlist = 0; nlist < NR_LIST; nlist++) {
654 bh = lru_list[nlist];
655 if (!bh)
656 continue;
657 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
658 bh_next = bh->b_next_free;
659 if (bh->b_dev != dev || bh->b_size == size)
660 continue;
661 if (buffer_locked(bh)) {
662 atomic_inc(&bh->b_count);
663 spin_unlock(&lru_list_lock);
664 wait_on_buffer(bh);
665 slept = 1;
666 spin_lock(&lru_list_lock);
667 atomic_dec(&bh->b_count);
670 write_lock(&hash_table_lock);
671 if (!atomic_read(&bh->b_count)) {
672 if (buffer_dirty(bh))
673 printk(KERN_WARNING
674 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
675 kdevname(dev), bh->b_blocknr, bh->b_size);
676 __remove_from_queues(bh);
677 put_last_free(bh);
678 } else {
679 if (atomic_set_buffer_clean(bh))
680 __refile_buffer(bh);
681 clear_bit(BH_Uptodate, &bh->b_state);
682 printk(KERN_WARNING
683 "set_blocksize: "
684 "b_count %d, dev %s, block %lu, from %p\n",
685 atomic_read(&bh->b_count), bdevname(bh->b_dev),
686 bh->b_blocknr, __builtin_return_address(0));
688 write_unlock(&hash_table_lock);
689 if (slept)
690 goto out;
693 out:
694 spin_unlock(&lru_list_lock);
695 if (slept)
696 goto retry;
700 * We used to try various strange things. Let's not.
702 static void refill_freelist(int size)
704 if (!grow_buffers(size)) {
705 wakeup_bdflush(1);
706 current->policy |= SCHED_YIELD;
707 schedule();
711 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
713 bh->b_list = BUF_CLEAN;
714 bh->b_end_io = handler;
715 bh->b_private = private;
718 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
720 mark_buffer_uptodate(bh, uptodate);
721 unlock_buffer(bh);
724 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
726 mark_buffer_uptodate(bh, uptodate);
727 unlock_buffer(bh);
728 BUG();
731 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
733 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
734 unsigned long flags;
735 struct buffer_head *tmp;
736 struct page *page;
738 mark_buffer_uptodate(bh, uptodate);
740 /* This is a temporary buffer used for page I/O. */
741 page = bh->b_page;
743 if (!uptodate)
744 SetPageError(page);
747 * Be _very_ careful from here on. Bad things can happen if
748 * two buffer heads end IO at almost the same time and both
749 * decide that the page is now completely done.
751 * Async buffer_heads are here only as labels for IO, and get
752 * thrown away once the IO for this page is complete. IO is
753 * deemed complete once all buffers have been visited
754 * (b_count==0) and are now unlocked. We must make sure that
755 * only the _last_ buffer that decrements its count is the one
756 * that unlocks the page..
758 spin_lock_irqsave(&page_uptodate_lock, flags);
759 unlock_buffer(bh);
760 atomic_dec(&bh->b_count);
761 tmp = bh->b_this_page;
762 while (tmp != bh) {
763 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
764 goto still_busy;
765 tmp = tmp->b_this_page;
768 /* OK, the async IO on this page is complete. */
769 spin_unlock_irqrestore(&page_uptodate_lock, flags);
772 * if none of the buffers had errors then we can set the
773 * page uptodate:
775 if (!PageError(page))
776 SetPageUptodate(page);
779 * Run the hooks that have to be done when a page I/O has completed.
781 if (PageTestandClearDecrAfter(page))
782 atomic_dec(&nr_async_pages);
784 UnlockPage(page);
786 return;
788 still_busy:
789 spin_unlock_irqrestore(&page_uptodate_lock, flags);
790 return;
794 * Ok, this is getblk, and it isn't very clear, again to hinder
795 * race-conditions. Most of the code is seldom used (ie repeating),
796 * so it should be much more efficient than it looks.
798 * The algorithm is changed: hopefully better, and an elusive bug removed.
800 * 14.02.92: changed it to sync dirty buffers a bit: better performance
801 * when the filesystem starts to get full of dirty blocks (I hope).
803 struct buffer_head * getblk(kdev_t dev, int block, int size)
805 struct buffer_head * bh;
806 int isize;
808 repeat:
809 bh = get_hash_table(dev, block, size);
810 if (bh)
811 goto out;
813 isize = BUFSIZE_INDEX(size);
814 spin_lock(&free_list[isize].lock);
815 bh = free_list[isize].list;
816 if (bh) {
817 __remove_from_free_list(bh, isize);
818 atomic_set(&bh->b_count, 1);
820 spin_unlock(&free_list[isize].lock);
823 * OK, FINALLY we know that this buffer is the only one of
824 * its kind, we hold a reference (b_count>0), it is unlocked,
825 * and it is clean.
827 if (bh) {
828 init_buffer(bh, end_buffer_io_sync, NULL);
829 bh->b_dev = dev;
830 bh->b_blocknr = block;
831 bh->b_state = 1 << BH_Mapped;
833 /* Insert the buffer into the regular lists */
834 insert_into_queues(bh);
835 out:
836 touch_buffer(bh);
837 return bh;
841 * If we block while refilling the free list, somebody may
842 * create the buffer first ... search the hashes again.
844 refill_freelist(size);
845 goto repeat;
848 /* -1 -> no need to flush
849 0 -> async flush
850 1 -> sync flush (wait for I/O completion) */
851 static int balance_dirty_state(kdev_t dev)
853 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
855 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
856 tot = nr_free_buffer_pages();
857 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
859 dirty *= 200;
860 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
861 hard_dirty_limit = soft_dirty_limit * 2;
863 if (dirty > soft_dirty_limit) {
864 if (dirty > hard_dirty_limit)
865 return 1;
866 return 0;
868 return -1;
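/* With the default nfract of 40 the soft limit trips once dirty pages exceed
 * 20% of the freeable total (dirty*200 > tot*40), and the hard limit at 40%. */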
872 * if a new dirty buffer is created we need to balance bdflush.
874 * in the future we might want to make bdflush aware of different
875 * pressures on different devices - thus the (currently unused)
876 * 'dev' parameter.
878 void balance_dirty(kdev_t dev)
880 int state = balance_dirty_state(dev);
882 if (state < 0)
883 return;
884 wakeup_bdflush(state);
887 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
889 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
890 refile_buffer(bh);
893 /* atomic version, the user must call balance_dirty() by hand
894 as soon as it becomes possible to block */
895 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
897 if (!atomic_set_buffer_dirty(bh))
898 __mark_dirty(bh, flag);
901 void mark_buffer_dirty(struct buffer_head *bh, int flag)
903 __mark_buffer_dirty(bh, flag);
904 balance_dirty(bh->b_dev);
908 * A buffer may need to be moved from one buffer list to another
909 * (e.g. in case it is not shared any more). Handle this.
911 static void __refile_buffer(struct buffer_head *bh)
913 int dispose = BUF_CLEAN;
914 if (buffer_locked(bh))
915 dispose = BUF_LOCKED;
916 if (buffer_dirty(bh))
917 dispose = BUF_DIRTY;
918 if (buffer_protected(bh))
919 dispose = BUF_PROTECTED;
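/* Note the precedence: protected overrides dirty, which overrides
 * locked, which overrides clean. */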
920 if (dispose != bh->b_list) {
921 __remove_from_lru_list(bh, bh->b_list);
922 bh->b_list = dispose;
923 __insert_into_lru_list(bh, dispose);
927 void refile_buffer(struct buffer_head *bh)
929 spin_lock(&lru_list_lock);
930 __refile_buffer(bh);
931 spin_unlock(&lru_list_lock);
935 * Release a buffer head
937 void __brelse(struct buffer_head * buf)
939 if (atomic_read(&buf->b_count)) {
940 atomic_dec(&buf->b_count);
941 return;
943 printk("VFS: brelse: Trying to free free buffer\n");
947 * bforget() is like brelse(), except it puts the buffer on the
948 * free list if it can.. We can NOT free the buffer if:
949 * - there are other users of it
950 * - it is locked and thus can have active IO
952 void __bforget(struct buffer_head * buf)
954 /* grab the lru lock here to block bdflush. */
955 spin_lock(&lru_list_lock);
956 write_lock(&hash_table_lock);
957 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
958 goto in_use;
959 __hash_unlink(buf);
960 write_unlock(&hash_table_lock);
961 __remove_from_lru_list(buf, buf->b_list);
962 spin_unlock(&lru_list_lock);
963 put_last_free(buf);
964 return;
966 in_use:
967 write_unlock(&hash_table_lock);
968 spin_unlock(&lru_list_lock);
972 * bread() reads a specified block and returns the buffer that contains
973 * it. It returns NULL if the block was unreadable.
975 struct buffer_head * bread(kdev_t dev, int block, int size)
977 struct buffer_head * bh;
979 bh = getblk(dev, block, size);
980 if (buffer_uptodate(bh))
981 return bh;
982 ll_rw_block(READ, 1, &bh);
983 wait_on_buffer(bh);
984 if (buffer_uptodate(bh))
985 return bh;
986 brelse(bh);
987 return NULL;
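/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 *
 *	struct buffer_head *bh = bread(dev, blocknr, 1024);
 *	if (bh) {
 *		memcpy(buf, bh->b_data, 1024);
 *		brelse(bh);
 *	}
 *
 * bread() returns the buffer with an elevated b_count, so every successful
 * call must be balanced by brelse() (or bforget()).
 */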
991 * Ok, breada can be used like bread, but additionally marks other
992 * blocks for reading as well. End the argument list with a negative
993 * number.
996 #define NBUF 16
998 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
999 unsigned int pos, unsigned int filesize)
1001 struct buffer_head * bhlist[NBUF];
1002 unsigned int blocks;
1003 struct buffer_head * bh;
1004 int index;
1005 int i, j;
1007 if (pos >= filesize)
1008 return NULL;
1010 if (block < 0)
1011 return NULL;
1013 bh = getblk(dev, block, bufsize);
1014 index = BUFSIZE_INDEX(bh->b_size);
1016 if (buffer_uptodate(bh))
1017 return(bh);
1018 else ll_rw_block(READ, 1, &bh);
1020 blocks = (filesize - pos) >> (9+index);
1022 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1023 blocks = read_ahead[MAJOR(dev)] >> index;
1024 if (blocks > NBUF)
1025 blocks = NBUF;
1027 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1029 bhlist[0] = bh;
1030 j = 1;
1031 for(i=1; i<blocks; i++) {
1032 bh = getblk(dev,block+i,bufsize);
1033 if (buffer_uptodate(bh)) {
1034 brelse(bh);
1035 break;
1037 else bhlist[j++] = bh;
1040 /* Request the read for these buffers, and then release them. */
1041 if (j>1)
1042 ll_rw_block(READA, (j-1), bhlist+1);
1043 for(i=1; i<j; i++)
1044 brelse(bhlist[i]);
1046 /* Wait for this buffer, and then continue on. */
1047 bh = bhlist[0];
1048 wait_on_buffer(bh);
1049 if (buffer_uptodate(bh))
1050 return bh;
1051 brelse(bh);
1052 return NULL;
1056 * Note: the caller should wake up the buffer_wait list if needed.
1058 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1060 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1061 kmem_cache_free(bh_cachep, bh);
1062 } else {
1063 bh->b_blocknr = -1;
1064 init_waitqueue_head(&bh->b_wait);
1065 nr_unused_buffer_heads++;
1066 bh->b_next_free = unused_list;
1067 bh->b_this_page = NULL;
1068 unused_list = bh;
1073 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1074 * no-buffer-head deadlock. Return NULL on failure; waiting for
1075 * buffer heads is now handled in create_buffers().
1077 static struct buffer_head * get_unused_buffer_head(int async)
1079 struct buffer_head * bh;
1081 spin_lock(&unused_list_lock);
1082 if (nr_unused_buffer_heads > NR_RESERVED) {
1083 bh = unused_list;
1084 unused_list = bh->b_next_free;
1085 nr_unused_buffer_heads--;
1086 spin_unlock(&unused_list_lock);
1087 return bh;
1089 spin_unlock(&unused_list_lock);
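/* Past this point we are at or below the NR_RESERVED watermark: try the
 * slab allocator first, and only async callers may dip into the reserve
 * further down. */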
1091 /* This is critical. We can't swap out pages to get
1092 * more buffer heads, because the swap-out may need
1093 * more buffer-heads itself. Thus SLAB_BUFFER.
1095 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1096 memset(bh, 0, sizeof(*bh));
1097 init_waitqueue_head(&bh->b_wait);
1098 return bh;
1102 * If we need an async buffer, use the reserved buffer heads.
1104 if (async) {
1105 spin_lock(&unused_list_lock);
1106 if (unused_list) {
1107 bh = unused_list;
1108 unused_list = bh->b_next_free;
1109 nr_unused_buffer_heads--;
1110 spin_unlock(&unused_list_lock);
1111 return bh;
1113 spin_unlock(&unused_list_lock);
1115 #if 0
1117 * (Pending further analysis ...)
1118 * Ordinary (non-async) requests can use a different memory priority
1119 * to free up pages. Any swapping thus generated will use async
1120 * buffer heads.
1122 if(!async &&
1123 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1124 memset(bh, 0, sizeof(*bh));
1125 init_waitqueue_head(&bh->b_wait);
1126 return bh;
1128 #endif
1130 return NULL;
1133 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1135 bh->b_page = page;
1136 if (offset >= PAGE_SIZE)
1137 BUG();
1138 if (PageHighMem(page))
1140 * This catches illegal uses and preserves the offset:
1142 bh->b_data = (char *)(0 + offset);
1143 else
1144 bh->b_data = (char *)(page_address(page) + offset);
1148 * Create the appropriate buffers when given a page for data area and
1149 * the size of each buffer.. Use the bh->b_this_page linked list to
1150 * follow the buffers created. Return NULL if unable to create more
1151 * buffers.
1152 * The async flag is used to differentiate async IO (paging, swapping)
1153 * from ordinary buffer allocations, and only async requests are allowed
1154 * to sleep waiting for buffer heads.
1156 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1158 struct buffer_head *bh, *head;
1159 long offset;
1161 try_again:
1162 head = NULL;
1163 offset = PAGE_SIZE;
1164 while ((offset -= size) >= 0) {
1165 bh = get_unused_buffer_head(async);
1166 if (!bh)
1167 goto no_grow;
1169 bh->b_dev = B_FREE; /* Flag as unused */
1170 bh->b_this_page = head;
1171 head = bh;
1173 bh->b_state = 0;
1174 bh->b_next_free = NULL;
1175 bh->b_pprev = NULL;
1176 atomic_set(&bh->b_count, 0);
1177 bh->b_size = size;
1179 set_bh_page(bh, page, offset);
1181 bh->b_list = BUF_CLEAN;
1182 bh->b_end_io = end_buffer_io_bad;
1184 return head;
1186 * In case anything failed, we just free everything we got.
1188 no_grow:
1189 if (head) {
1190 spin_lock(&unused_list_lock);
1191 do {
1192 bh = head;
1193 head = head->b_this_page;
1194 __put_unused_buffer_head(bh);
1195 } while (head);
1196 spin_unlock(&unused_list_lock);
1198 /* Wake up any waiters ... */
1199 wake_up(&buffer_wait);
1203 * Return failure for non-async IO requests. Async IO requests
1204 * are not allowed to fail, so we have to wait until buffer heads
1205 * become available. But we don't want tasks sleeping with
1206 * partially complete buffers, so all were released above.
1208 if (!async)
1209 return NULL;
1211 /* We're _really_ low on memory. Now we just
1212 * wait for old buffer heads to become free due to
1213 * finishing IO. Since this is an async request and
1214 * the reserve list is empty, we're sure there are
1215 * async buffer heads in use.
1217 run_task_queue(&tq_disk);
1220 * Set our state for sleeping, then check again for buffer heads.
1221 * This ensures we won't miss a wake_up from an interrupt.
1223 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1224 goto try_again;
1227 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1229 struct buffer_head *head, *bh, *tail;
1230 int block;
1232 if (!PageLocked(page))
1233 BUG();
1235 * Allocate async buffer heads pointing to this page, just for I/O.
1236 * They don't show up in the buffer hash table, but they *are*
1237 * registered in page->buffers.
1239 head = create_buffers(page, size, 1);
1240 if (page->buffers)
1241 BUG();
1242 if (!head)
1243 BUG();
1244 tail = head;
1245 for (bh = head; bh; bh = bh->b_this_page) {
1246 block = *(b++);
1248 tail = bh;
1249 init_buffer(bh, end_buffer_io_async, NULL);
1250 bh->b_dev = dev;
1251 bh->b_blocknr = block;
1253 set_bit(BH_Mapped, &bh->b_state);
1255 tail->b_this_page = head;
1256 page_cache_get(page);
1257 page->buffers = head;
1258 return 0;
1261 static void unmap_buffer(struct buffer_head * bh)
1263 if (buffer_mapped(bh)) {
1264 mark_buffer_clean(bh);
1265 wait_on_buffer(bh);
1266 clear_bit(BH_Uptodate, &bh->b_state);
1267 clear_bit(BH_Mapped, &bh->b_state);
1268 clear_bit(BH_Req, &bh->b_state);
1269 clear_bit(BH_New, &bh->b_state);
1274 * We don't have to release all buffers here, but
1275 * we have to be sure that no dirty buffer is left
1276 * and no IO is going on (no buffer is locked), because
1277 * we have truncated the file and are going to free the
1278 * blocks on-disk..
1280 int block_flushpage(struct page *page, unsigned long offset)
1282 struct buffer_head *head, *bh, *next;
1283 unsigned int curr_off = 0;
1285 if (!PageLocked(page))
1286 BUG();
1287 if (!page->buffers)
1288 return 1;
1290 head = page->buffers;
1291 bh = head;
1292 do {
1293 unsigned int next_off = curr_off + bh->b_size;
1294 next = bh->b_this_page;
1297 * is this block fully flushed?
1299 if (offset <= curr_off)
1300 unmap_buffer(bh);
1301 curr_off = next_off;
1302 bh = next;
1303 } while (bh != head);
1306 * subtle. We release buffer-heads only if this is
1307 * the 'final' flushpage. We have invalidated the get_block
1308 * cached value unconditionally, so real IO is not
1309 * possible anymore.
1311 * If the free doesn't work out, the buffers can be
1312 * left around - they just turn into anonymous buffers
1313 * instead.
1315 if (!offset) {
1316 if (!try_to_free_buffers(page, 0)) {
1317 atomic_inc(&buffermem_pages);
1318 return 0;
1322 return 1;
1325 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1327 struct buffer_head *bh, *head, *tail;
1329 head = create_buffers(page, blocksize, 1);
1330 if (page->buffers)
1331 BUG();
1333 bh = head;
1334 do {
1335 bh->b_dev = inode->i_dev;
1336 bh->b_blocknr = 0;
1337 bh->b_end_io = end_buffer_io_bad;
1338 tail = bh;
1339 bh = bh->b_this_page;
1340 } while (bh);
1341 tail->b_this_page = head;
1342 page->buffers = head;
1343 page_cache_get(page);
1346 static void unmap_underlying_metadata(struct buffer_head * bh)
1348 struct buffer_head *old_bh;
1350 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1351 if (old_bh) {
1352 unmap_buffer(old_bh);
1353 /* Here we could run brelse or bforget. We use
1354 bforget because it will try to put the buffer
1355 in the freelist. */
1356 __bforget(old_bh);
1361 * block_write_full_page() is SMP-safe - currently it's still
1362 * being called with the kernel lock held, but the code is ready.
1364 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1366 int err, i, need_balance_dirty = 0;
1367 unsigned long block;
1368 struct buffer_head *bh, *head;
1370 if (!PageLocked(page))
1371 BUG();
1373 if (!page->buffers)
1374 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1375 head = page->buffers;
1377 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1379 bh = head;
1380 i = 0;
1381 do {
1383 * If the buffer isn't up-to-date, we can't be sure
1384 * that the buffer has been initialized with the proper
1385 * block number information etc..
1387 * Leave it to the low-level FS to make all those
1388 * decisions (block #0 may actually be a valid block)
1390 bh->b_end_io = end_buffer_io_sync;
1391 if (!buffer_mapped(bh)) {
1392 err = get_block(inode, block, bh, 1);
1393 if (err)
1394 goto out;
1395 if (buffer_new(bh))
1396 unmap_underlying_metadata(bh);
1398 set_bit(BH_Uptodate, &bh->b_state);
1399 if (!atomic_set_buffer_dirty(bh)) {
1400 __mark_dirty(bh, 0);
1401 need_balance_dirty = 1;
1404 bh = bh->b_this_page;
1405 block++;
1406 } while (bh != head);
1408 if (need_balance_dirty)
1409 balance_dirty(bh->b_dev);
1411 SetPageUptodate(page);
1412 return 0;
1413 out:
1414 ClearPageUptodate(page);
1415 return err;
1418 static int __block_prepare_write(struct inode *inode, struct page *page,
1419 unsigned from, unsigned to, get_block_t *get_block)
1421 unsigned block_start, block_end;
1422 unsigned long block;
1423 int err = 0;
1424 unsigned blocksize, bbits;
1425 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1426 char *kaddr = (char *)kmap(page);
1428 blocksize = inode->i_sb->s_blocksize;
1429 if (!page->buffers)
1430 create_empty_buffers(page, inode, blocksize);
1431 head = page->buffers;
1433 bbits = inode->i_sb->s_blocksize_bits;
1434 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1436 for(bh = head, block_start = 0; bh != head || !block_start;
1437 block++, block_start=block_end, bh = bh->b_this_page) {
1438 if (!bh)
1439 BUG();
1440 block_end = block_start+blocksize;
1441 if (block_end <= from)
1442 continue;
1443 if (block_start >= to)
1444 break;
1445 bh->b_end_io = end_buffer_io_sync;
1446 if (!buffer_mapped(bh)) {
1447 err = get_block(inode, block, bh, 1);
1448 if (err)
1449 goto out;
1450 if (buffer_new(bh)) {
1451 unmap_underlying_metadata(bh);
1452 if (block_end > to)
1453 memset(kaddr+to, 0, block_end-to);
1454 if (block_start < from)
1455 memset(kaddr+block_start, 0, from-block_start);
1456 continue;
1459 if (!buffer_uptodate(bh) &&
1460 (block_start < from || block_end > to)) {
1461 ll_rw_block(READ, 1, &bh);
1462 *wait_bh++=bh;
1466 * If we issued read requests - let them complete.
1468 while(wait_bh > wait) {
1469 wait_on_buffer(*--wait_bh);
1470 err = -EIO;
1471 if (!buffer_uptodate(*wait_bh))
1472 goto out;
1474 return 0;
1475 out:
1476 return err;
1479 static int __block_commit_write(struct inode *inode, struct page *page,
1480 unsigned from, unsigned to)
1482 unsigned block_start, block_end;
1483 int partial = 0, need_balance_dirty = 0;
1484 unsigned blocksize;
1485 struct buffer_head *bh, *head;
1487 blocksize = inode->i_sb->s_blocksize;
1489 for(bh = head = page->buffers, block_start = 0;
1490 bh != head || !block_start;
1491 block_start=block_end, bh = bh->b_this_page) {
1492 block_end = block_start + blocksize;
1493 if (block_end <= from || block_start >= to) {
1494 if (!buffer_uptodate(bh))
1495 partial = 1;
1496 } else {
1497 set_bit(BH_Uptodate, &bh->b_state);
1498 if (!atomic_set_buffer_dirty(bh)) {
1499 __mark_dirty(bh, 0);
1500 need_balance_dirty = 1;
1505 if (need_balance_dirty)
1506 balance_dirty(bh->b_dev);
1508 * If this partial write happened to make all buffers
1509 * uptodate then we can optimize away a bogus readpage() for
1510 * the next read(). Here we 'discover' whether the page went
1511 * uptodate as a result of this (potentially partial) write.
1513 if (!partial)
1514 SetPageUptodate(page);
1515 return 0;
1519 * Generic "read page" function for block devices that have the normal
1520 * get_block functionality. This is most of the block device filesystems.
1521 * Reads the page asynchronously --- the unlock_buffer() and
1522 * mark_buffer_uptodate() functions propagate buffer state into the
1523 * page struct once IO has completed.
1525 int block_read_full_page(struct page *page, get_block_t *get_block)
1527 struct inode *inode = (struct inode*)page->mapping->host;
1528 unsigned long iblock, lblock;
1529 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1530 unsigned int blocksize, blocks;
1531 unsigned long kaddr = 0;
1532 int nr, i;
1534 if (!PageLocked(page))
1535 PAGE_BUG(page);
1536 blocksize = inode->i_sb->s_blocksize;
1537 if (!page->buffers)
1538 create_empty_buffers(page, inode, blocksize);
1539 head = page->buffers;
1541 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1542 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1543 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1544 bh = head;
1545 nr = 0;
1546 i = 0;
1548 do {
1549 if (buffer_uptodate(bh))
1550 continue;
1552 if (!buffer_mapped(bh)) {
1553 if (iblock < lblock)
1554 get_block(inode, iblock, bh, 0);
1555 if (!buffer_mapped(bh)) {
1556 if (!kaddr)
1557 kaddr = kmap(page);
1558 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1559 set_bit(BH_Uptodate, &bh->b_state);
1560 continue;
1564 init_buffer(bh, end_buffer_io_async, NULL);
1565 atomic_inc(&bh->b_count);
1566 arr[nr] = bh;
1567 nr++;
1568 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1570 if (nr) {
1571 if (Page_Uptodate(page))
1572 BUG();
1573 ll_rw_block(READ, nr, arr);
1574 } else {
1576 * all buffers are uptodate - we can set the page
1577 * uptodate as well.
1579 SetPageUptodate(page);
1580 UnlockPage(page);
1582 if (kaddr)
1583 kunmap(page);
1584 return 0;
1588 * For moronic filesystems that do not allow holes in files.
1589 * We may have to extend the file.
1592 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1594 struct address_space *mapping = page->mapping;
1595 struct inode *inode = (struct inode*)mapping->host;
1596 struct page *new_page;
1597 unsigned long pgpos;
1598 long status;
1599 unsigned zerofrom;
1600 unsigned blocksize = inode->i_sb->s_blocksize;
1601 char *kaddr;
1603 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1604 status = -ENOMEM;
1605 new_page = grab_cache_page(mapping, pgpos);
1606 if (!new_page)
1607 goto out;
1608 /* we might sleep */
1609 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1610 UnlockPage(new_page);
1611 page_cache_release(new_page);
1612 continue;
1614 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1615 if (zerofrom & (blocksize-1)) {
1616 *bytes |= (blocksize-1);
1617 (*bytes)++;
1619 status = __block_prepare_write(inode, new_page, zerofrom,
1620 PAGE_CACHE_SIZE, get_block);
1621 if (status)
1622 goto out_unmap;
1623 kaddr = (char*)page_address(new_page);
1624 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1625 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1626 kunmap(new_page);
1627 UnlockPage(new_page);
1628 page_cache_release(new_page);
1631 if (page->index < pgpos) {
1632 /* completely inside the area */
1633 zerofrom = offset;
1634 } else {
1635 /* page covers the boundary, find the boundary offset */
1636 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1638 /* if we will expand the thing last block will be filled */
1639 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1640 *bytes |= (blocksize-1);
1641 (*bytes)++;
1644 /* starting below the boundary? Nothing to zero out */
1645 if (offset <= zerofrom)
1646 zerofrom = offset;
1648 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1649 if (status)
1650 goto out1;
1651 kaddr = (char*)page_address(page);
1652 if (zerofrom < offset) {
1653 memset(kaddr+zerofrom, 0, offset-zerofrom);
1654 __block_commit_write(inode, page, zerofrom, offset);
1656 return 0;
1657 out1:
1658 ClearPageUptodate(page);
1659 kunmap(page);
1660 return status;
1662 out_unmap:
1663 ClearPageUptodate(new_page);
1664 kunmap(new_page);
1665 UnlockPage(new_page);
1666 page_cache_release(new_page);
1667 out:
1668 return status;
1671 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1672 get_block_t *get_block)
1674 struct inode *inode = (struct inode*)page->mapping->host;
1675 int err = __block_prepare_write(inode, page, from, to, get_block);
1676 if (err) {
1677 ClearPageUptodate(page);
1678 kunmap(page);
1680 return err;
1683 int generic_commit_write(struct file *file, struct page *page,
1684 unsigned from, unsigned to)
1686 struct inode *inode = (struct inode*)page->mapping->host;
1687 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1688 __block_commit_write(inode,page,from,to);
1689 kunmap(page);
1690 if (pos > inode->i_size)
1691 inode->i_size = pos;
1692 return 0;
1695 int block_write_full_page(struct page *page, get_block_t *get_block)
1697 struct inode *inode = (struct inode*)page->mapping->host;
1698 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1699 unsigned offset;
1700 int err;
1702 /* easy case */
1703 if (page->index < end_index)
1704 return __block_write_full_page(inode, page, get_block);
1706 /* things got complicated... */
1707 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1708 /* OK, are we completely out? */
1709 if (page->index >= end_index+1 || !offset)
1710 return -EIO;
1711 /* Sigh... will have to work, then... */
1712 err = __block_prepare_write(inode, page, 0, offset, get_block);
1713 if (!err) {
1714 memset((char *)page_address(page)+offset, 0, PAGE_CACHE_SIZE-offset);
1715 __block_commit_write(inode,page,0,offset);
1716 done:
1717 kunmap(page);
1718 return err;
1720 ClearPageUptodate(page);
1721 goto done;
1724 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1726 struct buffer_head tmp;
1727 struct inode *inode = (struct inode*)mapping->host;
1728 tmp.b_state = 0;
1729 tmp.b_blocknr = 0;
1730 get_block(inode, block, &tmp, 0);
1731 return tmp.b_blocknr;
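/* i.e. ask get_block() with create == 0 against a throwaway buffer_head and
 * return whatever physical block number it filled in (it stays 0 if the
 * block is unmapped). */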
1735 * IO completion routine for a buffer_head being used for kiobuf IO: we
1736 * can't dispatch the kiobuf callback until io_count reaches 0.
1739 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1741 struct kiobuf *kiobuf;
1743 mark_buffer_uptodate(bh, uptodate);
1745 kiobuf = bh->b_private;
1746 unlock_buffer(bh);
1747 end_kio_request(kiobuf, uptodate);
1752 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1753 * for them to complete. Clean up the buffer_heads afterwards.
1756 static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
1758 int iosize;
1759 int i;
1760 struct buffer_head *tmp;
1762 if (rw == WRITE)
1763 rw = WRITERAW;
1764 ll_rw_block(rw, nr, bh);
1766 iosize = 0;
1767 spin_lock(&unused_list_lock);
1769 for (i = nr; --i >= 0; ) {
1770 iosize += size;
1771 tmp = bh[i];
1772 if (buffer_locked(tmp)) {
1773 spin_unlock(&unused_list_lock);
1774 wait_on_buffer(tmp);
1775 spin_lock(&unused_list_lock);
1778 if (!buffer_uptodate(tmp)) {
1779 /* We are traversing bh'es in reverse order so
1780 clearing iosize on error calculates the
1781 amount of IO before the first error. */
1782 iosize = 0;
1784 __put_unused_buffer_head(tmp);
1787 spin_unlock(&unused_list_lock);
1789 return iosize;
1793 * Start I/O on a physical range of kernel memory, defined by a vector
1794 * of kiobuf structs (much like a user-space iovec list).
1796 * The kiobuf must already be locked for IO. IO is submitted
1797 * asynchronously: you need to check page->locked, page->uptodate, and
1798 * maybe wait on page->wait.
1800 * It is up to the caller to make sure that there are enough blocks
1801 * passed in to completely map the iobufs to disk.
1804 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1805 kdev_t dev, unsigned long b[], int size)
1807 int err;
1808 int length;
1809 int transferred;
1810 int i;
1811 int bufind;
1812 int pageind;
1813 int bhind;
1814 int offset;
1815 unsigned long blocknr;
1816 struct kiobuf * iobuf = NULL;
1817 struct page * map;
1818 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1820 if (!nr)
1821 return 0;
1824 * First, do some alignment and validity checks
1826 for (i = 0; i < nr; i++) {
1827 iobuf = iovec[i];
1828 if ((iobuf->offset & (size-1)) ||
1829 (iobuf->length & (size-1)))
1830 return -EINVAL;
1831 if (!iobuf->nr_pages)
1832 panic("brw_kiovec: iobuf not initialised");
1836 * OK to walk down the iovec doing page IO on each page we find.
1838 bufind = bhind = transferred = err = 0;
1839 for (i = 0; i < nr; i++) {
1840 iobuf = iovec[i];
1841 offset = iobuf->offset;
1842 length = iobuf->length;
1843 iobuf->errno = 0;
1845 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1846 map = iobuf->maplist[pageind];
1847 if (!map) {
1848 err = -EFAULT;
1849 goto error;
1852 while (length > 0) {
1853 blocknr = b[bufind++];
1854 tmp = get_unused_buffer_head(0);
1855 if (!tmp) {
1856 err = -ENOMEM;
1857 goto error;
1860 tmp->b_dev = B_FREE;
1861 tmp->b_size = size;
1862 set_bh_page(tmp, map, offset);
1863 tmp->b_this_page = tmp;
1865 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
1866 tmp->b_dev = dev;
1867 tmp->b_blocknr = blocknr;
1868 tmp->b_state = 1 << BH_Mapped;
1870 if (rw == WRITE) {
1871 set_bit(BH_Uptodate, &tmp->b_state);
1872 set_bit(BH_Dirty, &tmp->b_state);
1875 bh[bhind++] = tmp;
1876 length -= size;
1877 offset += size;
1879 atomic_inc(&iobuf->io_count);
1882 * Start the IO if we have accumulated too many
1884 if (bhind >= KIO_MAX_SECTORS) {
1885 err = do_kio(rw, bhind, bh, size);
1886 if (err >= 0)
1887 transferred += err;
1888 else
1889 goto finished;
1890 bhind = 0;
1893 if (offset >= PAGE_SIZE) {
1894 offset = 0;
1895 break;
1897 } /* End of block loop */
1898 } /* End of page loop */
1899 } /* End of iovec loop */
1901 /* Is there any IO still left to submit? */
1902 if (bhind) {
1903 err = do_kio(rw, bhind, bh, size);
1904 if (err >= 0)
1905 transferred += err;
1906 else
1907 goto finished;
1910 finished:
1911 if (transferred)
1912 return transferred;
1913 return err;
1915 error:
1916 /* We got an error allocating the bh'es. Just free the current
1917 buffer_heads and exit. */
1918 spin_lock(&unused_list_lock);
1919 for (i = bhind; --i >= 0; ) {
1920 __put_unused_buffer_head(bh[i]);
1922 spin_unlock(&unused_list_lock);
1923 goto finished;
1927 * Start I/O on a page.
1928 * This function expects the page to be locked and may return
1929 * before I/O is complete. You then have to check page->locked,
1930 * page->uptodate, and maybe wait on page->wait.
1932 * brw_page() is SMP-safe, although it's being called with the
1933 * kernel lock held - but the code is ready.
1935 * FIXME: we need a swapper_inode->get_block function to remove
1936 * some of the bmap kludges and interface ugliness here.
1938 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
1940 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1941 int nr, fresh /* temporary debugging flag */, block;
1943 if (!PageLocked(page))
1944 panic("brw_page: page not locked for I/O");
1945 // ClearPageError(page);
1947 * We pretty much rely on the page lock for this, because
1948 * create_page_buffers() might sleep.
1950 fresh = 0;
1951 if (!page->buffers) {
1952 create_page_buffers(rw, page, dev, b, size);
1953 fresh = 1;
1955 if (!page->buffers)
1956 BUG();
1958 head = page->buffers;
1959 bh = head;
1960 nr = 0;
1961 do {
1962 block = *(b++);
1964 if (fresh && (atomic_read(&bh->b_count) != 0))
1965 BUG();
1966 if (rw == READ) {
1967 if (!fresh)
1968 BUG();
1969 if (!buffer_uptodate(bh)) {
1970 arr[nr++] = bh;
1971 atomic_inc(&bh->b_count);
1973 } else { /* WRITE */
1974 if (!bh->b_blocknr) {
1975 if (!block)
1976 BUG();
1977 bh->b_blocknr = block;
1978 } else {
1979 if (!block)
1980 BUG();
1982 set_bit(BH_Uptodate, &bh->b_state);
1983 set_bit(BH_Dirty, &bh->b_state);
1984 arr[nr++] = bh;
1985 atomic_inc(&bh->b_count);
1987 bh = bh->b_this_page;
1988 } while (bh != head);
1989 if ((rw == READ) && nr) {
1990 if (Page_Uptodate(page))
1991 BUG();
1992 ll_rw_block(rw, nr, arr);
1993 } else {
1994 if (!nr && rw == READ) {
1995 SetPageUptodate(page);
1996 UnlockPage(page);
1998 if (nr && (rw == WRITE))
1999 ll_rw_block(rw, nr, arr);
2001 return 0;
2004 int block_symlink(struct inode *inode, const char *symname, int len)
2006 struct address_space *mapping = inode->i_mapping;
2007 struct page *page = grab_cache_page(mapping, 0);
2008 int err = -ENOMEM;
2009 char *kaddr;
2011 if (!page)
2012 goto fail;
2013 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2014 if (err)
2015 goto fail_map;
2016 kaddr = (char*)page_address(page);
2017 memcpy(kaddr, symname, len-1);
2018 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2020 * Notice that we are _not_ going to block here - the end of the page is
2021 * unmapped, so this will only try to map the rest of the page, see
2022 * that it is unmapped (typically it will not even look into the inode -
2023 * ->i_size will be enough for everything) and zero it out.
2024 * OTOH it's obviously correct and should make the page up-to-date.
2026 err = mapping->a_ops->readpage(NULL, page);
2027 wait_on_page(page);
2028 page_cache_release(page);
2029 if (err < 0)
2030 goto fail;
2031 mark_inode_dirty(inode);
2032 return 0;
2033 fail_map:
2034 UnlockPage(page);
2035 page_cache_release(page);
2036 fail:
2037 return err;
2041 * Try to increase the number of buffers available: the size argument
2042 * is used to determine what kind of buffers we want.
2044 static int grow_buffers(int size)
2046 struct page * page;
2047 struct buffer_head *bh, *tmp;
2048 struct buffer_head * insert_point;
2049 int isize;
2051 if ((size & 511) || (size > PAGE_SIZE)) {
2052 printk("VFS: grow_buffers: size = %d\n",size);
2053 return 0;
2056 page = alloc_page(GFP_BUFFER);
2057 if (!page)
2058 goto out;
2059 bh = create_buffers(page, size, 0);
2060 if (!bh)
2061 goto no_buffer_head;
2063 isize = BUFSIZE_INDEX(size);
2065 spin_lock(&free_list[isize].lock);
2066 insert_point = free_list[isize].list;
2067 tmp = bh;
2068 while (1) {
2069 if (insert_point) {
2070 tmp->b_next_free = insert_point->b_next_free;
2071 tmp->b_prev_free = insert_point;
2072 insert_point->b_next_free->b_prev_free = tmp;
2073 insert_point->b_next_free = tmp;
2074 } else {
2075 tmp->b_prev_free = tmp;
2076 tmp->b_next_free = tmp;
2078 insert_point = tmp;
2079 if (tmp->b_this_page)
2080 tmp = tmp->b_this_page;
2081 else
2082 break;
2084 tmp->b_this_page = bh;
2085 free_list[isize].list = bh;
2086 spin_unlock(&free_list[isize].lock);
2088 page->buffers = bh;
2089 page->flags &= ~(1 << PG_referenced);
2090 lru_cache_add(page);
2091 atomic_inc(&buffermem_pages);
2092 return 1;
2094 no_buffer_head:
2095 page_cache_release(page);
2096 out:
2097 return 0;
2101 * Sync all the buffers on one page..
2103 * If we have old buffers that are locked, we'll
2104 * wait on them, but we won't wait on the new ones
2105 * we're writing out now.
2107 * This all is required so that we can free up memory
2108 * later.
2110 static void sync_page_buffers(struct buffer_head *bh, int wait)
2112 struct buffer_head * tmp = bh;
2114 do {
2115 struct buffer_head *p = tmp;
2116 tmp = tmp->b_this_page;
2117 if (buffer_locked(p)) {
2118 if (wait)
2119 __wait_on_buffer(p);
2120 } else if (buffer_dirty(p))
2121 ll_rw_block(WRITE, 1, &p);
2122 } while (tmp != bh);
2126 * Can the buffer be thrown out?
2128 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2129 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
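/* A buffer counts as busy if anyone holds a reference to it or it is
 * dirty, locked, or protected. */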
2132 * try_to_free_buffers() checks if all the buffers on this particular page
2133 * are unused, and frees the page if so.
2135 * Wake up bdflush() if this fails - if we're running low on memory due
2136 * to dirty buffers, we need to flush them out as quickly as possible.
2138 * NOTE: There are quite a number of ways that threads of control can
2139 * obtain a reference to a buffer head within a page. So we must
2140 * lock out all of these paths to cleanly toss the page.
2142 int try_to_free_buffers(struct page * page, int wait)
2144 struct buffer_head * tmp, * bh = page->buffers;
2145 int index = BUFSIZE_INDEX(bh->b_size);
2147 spin_lock(&lru_list_lock);
2148 write_lock(&hash_table_lock);
2149 spin_lock(&free_list[index].lock);
2150 tmp = bh;
2151 do {
2152 struct buffer_head *p = tmp;
2154 tmp = tmp->b_this_page;
2155 if (buffer_busy(p))
2156 goto busy_buffer_page;
2157 } while (tmp != bh);
2159 spin_lock(&unused_list_lock);
2160 tmp = bh;
2161 do {
2162 struct buffer_head * p = tmp;
2163 tmp = tmp->b_this_page;
2165 /* The buffer can be either on the regular
2166 * queues or on the free list..
2168 if (p->b_dev != B_FREE)
2169 __remove_from_queues(p);
2170 else
2171 __remove_from_free_list(p, index);
2172 __put_unused_buffer_head(p);
2173 } while (tmp != bh);
2174 spin_unlock(&unused_list_lock);
2176 /* Wake up anyone waiting for buffer heads */
2177 wake_up(&buffer_wait);
2179 /* And free the page */
2180 page->buffers = NULL;
2181 page_cache_release(page);
2182 spin_unlock(&free_list[index].lock);
2183 write_unlock(&hash_table_lock);
2184 spin_unlock(&lru_list_lock);
2185 return 1;
2187 busy_buffer_page:
2188 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2189 spin_unlock(&free_list[index].lock);
2190 write_unlock(&hash_table_lock);
2191 spin_unlock(&lru_list_lock);
2192 sync_page_buffers(bh, wait);
2193 return 0;
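/*
 * Note on locking: try_to_free_buffers() takes its locks in the order
 * lru_list_lock -> hash_table_lock -> free_list[index].lock, with
 * unused_list_lock nested innermost once the page is known to be
 * freeable, and drops them in the opposite order; as long as every user
 * of these locks sticks to the same nesting, this path stays free of
 * ABBA deadlocks.
 */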
2196 /* ================== Debugging =================== */
2198 void show_buffers(void)
2200 #ifdef CONFIG_SMP
2201 struct buffer_head * bh;
2202 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2203 int protected = 0;
2204 int nlist;
2205 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2206 #endif
2208 printk("Buffer memory: %6dkB\n",
2209 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2211 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2212 if (!spin_trylock(&lru_list_lock))
2213 return;
2214 for(nlist = 0; nlist < NR_LIST; nlist++) {
2215 found = locked = dirty = used = lastused = protected = 0;
2216 bh = lru_list[nlist];
2217 if(!bh) continue;
2219 do {
2220 found++;
2221 if (buffer_locked(bh))
2222 locked++;
2223 if (buffer_protected(bh))
2224 protected++;
2225 if (buffer_dirty(bh))
2226 dirty++;
2227 if (atomic_read(&bh->b_count))
2228 used++, lastused = found;
2229 bh = bh->b_next_free;
2230 } while (bh != lru_list[nlist]);
2232 int tmp = nr_buffers_type[nlist];
2233 if (found != tmp)
2234 printk("%9s: BUG -> found %d, reported %d\n",
2235 buf_types[nlist], found, tmp);
2237 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2238 "%d locked, %d protected, %d dirty\n",
2239 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2240 used, lastused, locked, protected, dirty);
2242 spin_unlock(&lru_list_lock);
2243 #endif
2246 /* ===================== Init ======================= */
2249 * allocate the hash table and init the free list
2250 * Use gfp() for the hash table to decrease TLB misses, use
2251 * SLAB cache for buffer heads.
2253 void __init buffer_init(unsigned long mempages)
2255 int order, i;
2256 unsigned int nr_hash;
2258 /* The buffer cache hash table is less important these days,
2259 * trim it a bit.
2261 mempages >>= 14;
2263 mempages *= sizeof(struct buffer_head *);
2265 for (order = 0; (1 << order) < mempages; order++)
2268 /* try to allocate something until we get it or we're asking
2269 for something that is really too small */
2271 do {
2272 unsigned long tmp;
2274 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2275 bh_hash_mask = (nr_hash - 1);
2277 tmp = nr_hash;
2278 bh_hash_shift = 0;
2279 while((tmp >>= 1UL) != 0UL)
2280 bh_hash_shift++;
2282 hash_table = (struct buffer_head **)
2283 __get_free_pages(GFP_ATOMIC, order);
2284 } while (hash_table == NULL && --order > 0);
2285 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2286 nr_hash, order, (PAGE_SIZE << order));
2288 if (!hash_table)
2289 panic("Failed to allocate buffer hash table\n");
2291 /* Setup hash chains. */
2292 for(i = 0; i < nr_hash; i++)
2293 hash_table[i] = NULL;
2295 /* Setup free lists. */
2296 for(i = 0; i < NR_SIZES; i++) {
2297 free_list[i].list = NULL;
2298 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2301 /* Setup lru lists. */
2302 for(i = 0; i < NR_LIST; i++)
2303 lru_list[i] = NULL;
2305 bh_cachep = kmem_cache_create("buffer_head",
2306 sizeof(struct buffer_head),
2307 0,
2308 SLAB_HWCACHE_ALIGN, NULL, NULL);
2309 if(!bh_cachep)
2310 panic("Cannot create buffer head SLAB cache\n");
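/*
 * Worked example of the sizing above (assuming 4 KiB pages, 32-bit
 * pointers and 128 MiB of RAM, i.e. mempages = 32768 on entry):
 *
 *	mempages >>= 14				-> 2
 *	mempages *= sizeof(struct buffer_head *)	-> 8
 *	order ends up as 3			(first order with (1 << order) >= 8)
 *	nr_hash = (4096 << 3) / 4		-> 8192 entries
 *	bh_hash_mask = 8191, bh_hash_shift = 13
 *
 * so the hash table occupies eight contiguous pages (32 KiB) obtained
 * from __get_free_pages(GFP_ATOMIC, 3).
 */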
2314 /* ====================== bdflush support =================== */
2316 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2317 * response to dirty buffers. Once this process is activated, we write back
2318 * a limited number of buffers to the disks and then go back to sleep again.
2320 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2321 struct task_struct *bdflush_tsk = 0;
2323 void wakeup_bdflush(int block)
2325 DECLARE_WAITQUEUE(wait, current);
2327 if (current == bdflush_tsk)
2328 return;
2330 if (!block) {
2331 wake_up_process(bdflush_tsk);
2332 return;
2335 /* kflushd can wake us up before we have a chance to
2336 go to sleep, so we must be careful in handling
2337 this wakeup event from kflushd to avoid deadlocking on SMP
2338 (we are not holding any lock anymore in these two paths). */
2339 __set_current_state(TASK_UNINTERRUPTIBLE);
2340 add_wait_queue(&bdflush_done, &wait);
2342 wake_up_process(bdflush_tsk);
2343 schedule();
2345 remove_wait_queue(&bdflush_done, &wait);
2346 __set_current_state(TASK_RUNNING);
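/*
 * For illustration (hypothetical call sites): the two modes of
 * wakeup_bdflush().  The non-blocking form just kicks kflushd and
 * returns; the blocking form parks the caller on bdflush_done until
 * kflushd has completed a flushing pass and woken the queue.
 */
#if 0
	wakeup_bdflush(0);	/* poke kflushd, don't wait */
	wakeup_bdflush(1);	/* sleep until kflushd signals bdflush_done */
#endif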
2349 /* This is the _only_ function that deals with flushing async writes
2350 to disk.
2351 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2352 as all dirty buffers live _only_ in the DIRTY lru list.
2353 As we never browse the LOCKED and CLEAN lru lists they are in fact
2354 completely useless. */
2355 static int flush_dirty_buffers(int check_flushtime)
2357 struct buffer_head * bh, *next;
2358 int flushed = 0, i;
2360 restart:
2361 spin_lock(&lru_list_lock);
2362 bh = lru_list[BUF_DIRTY];
2363 if (!bh)
2364 goto out_unlock;
2365 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2366 next = bh->b_next_free;
2368 if (!buffer_dirty(bh)) {
2369 __refile_buffer(bh);
2370 continue;
2372 if (buffer_locked(bh))
2373 continue;
2375 if (check_flushtime) {
2376 /* The dirty lru list is chronologically ordered, so
2377 if the current bh has not yet timed out,
2378 then all the following bhs
2379 will be too young as well. */
2380 if (time_before(jiffies, bh->b_flushtime))
2381 goto out_unlock;
2382 } else {
2383 if (++flushed > bdf_prm.b_un.ndirty)
2384 goto out_unlock;
2387 /* OK, now we are committed to write it out. */
2388 atomic_inc(&bh->b_count);
2389 spin_unlock(&lru_list_lock);
2390 ll_rw_block(WRITE, 1, &bh);
2391 atomic_dec(&bh->b_count);
2393 if (current->need_resched)
2394 schedule();
2395 goto restart;
2397 out_unlock:
2398 spin_unlock(&lru_list_lock);
2400 return flushed;
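/*
 * For illustration, the two ways this routine is driven by the callers
 * further down in this file:
 */
#if 0
	flush_dirty_buffers(0);	/* bdflush: write up to bdf_prm.b_un.ndirty buffers */
	flush_dirty_buffers(1);	/* kupdate: write only buffers whose b_flushtime expired */
#endif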
2404 * Here we attempt to write back old buffers. We also try to flush inodes
2405 * and supers as well, since this function is essentially "update", and
2406 * otherwise there would be no way of ensuring that these quantities ever
2407 * get written back. Ideally, we would have a timestamp on the inodes
2408 * and superblocks so that we could write back only the old ones as well
2411 static int sync_old_buffers(void)
2413 lock_kernel();
2414 sync_supers(0);
2415 sync_inodes(0);
2416 unlock_kernel();
2418 flush_dirty_buffers(1);
2419 /* must really sync all the active I/O requests to disk here */
2420 run_task_queue(&tq_disk);
2421 return 0;
2424 int block_sync_page(struct page *page)
2426 run_task_queue(&tq_disk);
2427 return 0;
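/*
 * "Syncing" here just means unplugging the disk task queue, as in
 * sync_old_buffers() above: any requests already queued for the page's
 * buffers get submitted, typically because the caller is about to wait
 * on the page.
 */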
2430 /* This is the interface to bdflush. As we get more sophisticated, we can
2431 * pass tuning parameters to this "process", to adjust how it behaves.
2432 * We would want to verify each parameter, however, to make sure that it
2433 * is reasonable. */
2435 asmlinkage long sys_bdflush(int func, long data)
2437 if (!capable(CAP_SYS_ADMIN))
2438 return -EPERM;
2440 if (func == 1) {
2441 /* do_exit directly and let kupdate do its work alone. */
2442 do_exit(0);
2443 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2444 a syscall that doesn't care about the current mm context. */
2445 int error;
2446 struct mm_struct *user_mm;
2449 * bdflush will spend all of its time in kernel-space,
2450 * without touching user-space, so we can switch it into
2451 * 'lazy TLB mode' to reduce the cost of context-switches
2452 * to and from bdflush.
2454 user_mm = start_lazy_tlb();
2455 error = sync_old_buffers();
2456 end_lazy_tlb(user_mm);
2457 return error;
2458 #endif
2461 /* Basically func 2 means read param 1, func 3 means write param 1, etc */
2462 if (func >= 2) {
2463 int i = (func-2) >> 1;
2464 if (i >= 0 && i < N_PARAM) {
2465 if ((func & 1) == 0)
2466 return put_user(bdf_prm.data[i], (int*)data);
2468 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2469 bdf_prm.data[i] = data;
2470 return 0;
2473 return -EINVAL;
2476 /* Calling with func 0 used to launch the actual bdflush and then never
2477 * return (unless explicitly killed). We return zero here to
2478 * remain semi-compatible with present update(8) programs.
2480 return 0;
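/*
 * For illustration (hypothetical snippet): the func encoding above.
 * Even values starting at 2 read a parameter into the int pointed to by
 * data, odd values write data into the parameter, subject to the
 * bdflush_min/bdflush_max bounds.
 */
#if 0
	int value;

	sys_bdflush(2, (long) &value);	/* read parameter 1 (bdf_prm.data[0]) */
	sys_bdflush(3, 40);		/* write 40 into parameter 1, if within bounds */
#endif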
2484 * This is the actual bdflush daemon itself. It used to be started from
2485 * the syscall above, but now we launch it ourselves internally with
2486 * kernel_thread(...) directly after the first thread in init/main.c
2488 int bdflush(void *sem)
2490 struct task_struct *tsk = current;
2491 int flushed;
2493 * We have a bare-bones task_struct, and really should fill
2494 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2495 * display semi-sane things. Not really crucial, though...
2498 tsk->session = 1;
2499 tsk->pgrp = 1;
2500 strcpy(tsk->comm, "kflushd");
2501 bdflush_tsk = tsk;
2503 /* avoid getting signals */
2504 spin_lock_irq(&tsk->sigmask_lock);
2505 flush_signals(tsk);
2506 sigfillset(&tsk->blocked);
2507 recalc_sigpending(tsk);
2508 spin_unlock_irq(&tsk->sigmask_lock);
2510 up((struct semaphore *)sem);
2512 for (;;) {
2513 CHECK_EMERGENCY_SYNC
2515 flushed = flush_dirty_buffers(0);
2517 /* If wakeup_bdflush wakes us up
2518 right after our bdflush_done wakeup, then
2519 we must make sure not to go to sleep
2520 in schedule() below, otherwise
2521 wakeup_bdflush may wait for a
2522 bdflush_done wakeup that would never arrive
2523 (as we would be sleeping) and so it would
2524 deadlock on SMP. */
2525 __set_current_state(TASK_INTERRUPTIBLE);
2526 wake_up(&bdflush_done);
2528 * If there are still a lot of dirty buffers around,
2529 * skip the sleep and flush some more. Otherwise, we
2530 * go to sleep waiting for a wakeup.
2532 if (!flushed || balance_dirty_state(NODEV) < 0)
2533 schedule();
2534 /* Remember to mark us as running, otherwise
2535 the next schedule() will block. */
2536 __set_current_state(TASK_RUNNING);
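/*
 * The ordering above is the whole trick: we set TASK_INTERRUPTIBLE
 * *before* waking bdflush_done, so a waiter whose wake_up_process()
 * arrives between that wakeup and our schedule() simply puts us back to
 * TASK_RUNNING, and schedule() returns for another pass instead of
 * sleeping through the request.
 */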
2541 * This is the kernel update daemon. It used to live in userspace,
2542 * but since it needs to run safely we want it to be unkillable by mistake.
2543 * You don't need to change your userspace configuration, since
2544 * the userspace `update` will do_exit(0) at its first sys_bdflush().
2546 int kupdate(void *sem)
2548 struct task_struct * tsk = current;
2549 int interval;
2551 tsk->session = 1;
2552 tsk->pgrp = 1;
2553 strcpy(tsk->comm, "kupdate");
2555 /* SIGSTOP and SIGCONT will stop and wake up kupdate */
2556 spin_lock_irq(&tsk->sigmask_lock);
2557 sigfillset(&tsk->blocked);
2558 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2559 recalc_sigpending(tsk);
2560 spin_unlock_irq(&tsk->sigmask_lock);
2562 up((struct semaphore *)sem);
2564 for (;;) {
2565 /* update interval */
2566 interval = bdf_prm.b_un.interval;
2567 if (interval) {
2568 tsk->state = TASK_INTERRUPTIBLE;
2569 schedule_timeout(interval);
2570 } else {
2571 stop_kupdate:
2572 tsk->state = TASK_STOPPED;
2573 schedule(); /* wait for SIGCONT */
2575 /* check for sigstop */
2576 if (signal_pending(tsk)) {
2577 int stopped = 0;
2578 spin_lock_irq(&tsk->sigmask_lock);
2579 if (sigismember(&tsk->signal, SIGSTOP)) {
2580 sigdelset(&tsk->signal, SIGSTOP);
2581 stopped = 1;
2583 recalc_sigpending(tsk);
2584 spin_unlock_irq(&tsk->sigmask_lock);
2585 if (stopped)
2586 goto stop_kupdate;
2588 #ifdef DEBUG
2589 printk("kupdate() activated...\n");
2590 #endif
2591 sync_old_buffers();
2595 static int __init bdflush_init(void)
2597 DECLARE_MUTEX_LOCKED(sem);
2598 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2599 down(&sem);
2600 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2601 down(&sem);
2602 return 0;
2605 module_init(bdflush_init)
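/*
 * The on-stack semaphore starts out locked; each daemon up()s it once
 * its task_struct is set up, and bdflush_init() down()s after each
 * kernel_thread() call, so kflushd is fully initialised before kupdate
 * is spawned and both daemons exist before init continues.
 */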