1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
 23 /* Added 32k buffer block sizes - these are required on older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
53 #define NR_SIZES 7
54 static char buffersize_index[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
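/*
 * [Editor's illustrative sketch - not part of the original buffer.c.]
 * BUFSIZE_INDEX maps a block size to its slot in free_list[] by indexing
 * buffersize_index with size>>9: 512->0, 1024->1, 2048->2, 4096->3,
 * 8192->4, 16384->5, 32768->6; any unsupported size yields -1.  A minimal
 * stand-alone check of that mapping (plain user-space C, no kernel headers):
 */
#if 0
#include <stdio.h>

/* Same mapping as buffersize_index[(size)>>9], computed instead of tabled. */
static int bufsize_index(int size)
{
	int x = size >> 9, idx = 0;

	if (x < 1 || x > 64 || (x & (x - 1)))	/* not 512..32768, or not 2^n */
		return -1;
	while ((1 << idx) != x)
		idx++;
	return idx;
}

int main(void)
{
	int sizes[] = { 512, 1024, 2048, 4096, 8192, 16384, 32768, 1536 };
	int i;

	for (i = 0; i < 8; i++)
		printf("%5d -> %d\n", sizes[i], bufsize_index(sizes[i]));
	return 0;	/* prints 0 1 2 3 4 5 6 -1 */
}
#endif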
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
72 * Hash table gook..
74 static unsigned int bh_hash_mask = 0;
75 static unsigned int bh_hash_shift = 0;
76 static struct buffer_head **hash_table;
77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
79 static struct buffer_head *lru_list[NR_LIST];
80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
81 static int nr_buffers_type[NR_LIST] = {0,};
82 static unsigned long size_buffers_type[NR_LIST] = {0,};
84 static struct buffer_head * unused_list = NULL;
85 static int nr_unused_buffer_heads = 0;
86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
89 struct bh_free_head {
90 struct buffer_head *list;
91 spinlock_t lock;
93 static struct bh_free_head free_list[NR_SIZES];
95 kmem_cache_t *bh_cachep;
97 static int grow_buffers(int size);
98 static void __refile_buffer(struct buffer_head *);
100 /* This is used by some architectures to estimate available memory. */
101 atomic_t buffermem_pages = ATOMIC_INIT(0);
103 /* Here is the parameter block for the bdflush process. If you add or
104 * remove any of the parameters, make sure to update kernel/sysctl.c.
107 #define N_PARAM 9
109 /* The dummy values in this structure are left in there for compatibility
110 * with old programs that play with the /proc entries.
112 union bdflush_param {
113 struct {
114 int nfract; /* Percentage of buffer cache dirty to
115 activate bdflush */
116 int ndirty; /* Maximum number of dirty blocks to write out per
117 wake-cycle */
118 int nrefill; /* Number of clean buffers to try to obtain
119 each time we call refill */
120 int nref_dirt; /* Dirty buffer threshold for activating bdflush
121 when trying to refill buffers. */
122 int interval; /* jiffies delay between kupdate flushes */
123 int age_buffer; /* Time for normal buffer to age before we flush it */
124 int age_super; /* Time for superblock to age before we flush it */
125 int dummy2; /* unused */
126 int dummy3; /* unused */
127 } b_un;
128 unsigned int data[N_PARAM];
129 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
131 /* These are the min and max parameter values that we will allow to be assigned */
132 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
133 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
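/*
 * [Editor's illustrative sketch - not part of the original buffer.c.]
 * Because bdf_prm is a union, bdf_prm.data[i] aliases the named fields in
 * declaration order (data[0] == b_un.nfract, data[1] == b_un.ndirty, ...),
 * which is the flat view that /proc/sys/vm/bdflush and kernel/sysctl.c work
 * with.  A hypothetical tuner updating one parameter would be expected to
 * respect bdflush_min[]/bdflush_max[]; the real range checking lives outside
 * this file, so this only illustrates the intended layout and bounds:
 */
#if 0
static int set_bdflush_param(unsigned int i, int val)
{
	if (i >= N_PARAM)
		return -EINVAL;
	if (val < bdflush_min[i] || val > bdflush_max[i])
		return -EINVAL;
	bdf_prm.data[i] = val;		/* e.g. i == 0 changes b_un.nfract */
	return 0;
}
#endif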
136 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 137 * and got rid of the cli-sti pairs. The wait-queue routines still
138 * need cli-sti, but now it's just a couple of 386 instructions or so.
140 * Note that the real wait_on_buffer() is an inline function that checks
141 * if 'b_wait' is set before calling this, so that the queues aren't set
142 * up unnecessarily.
144 void __wait_on_buffer(struct buffer_head * bh)
146 struct task_struct *tsk = current;
147 DECLARE_WAITQUEUE(wait, tsk);
149 atomic_inc(&bh->b_count);
150 add_wait_queue(&bh->b_wait, &wait);
151 do {
152 run_task_queue(&tq_disk);
153 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
154 if (!buffer_locked(bh))
155 break;
156 schedule();
157 } while (buffer_locked(bh));
158 tsk->state = TASK_RUNNING;
159 remove_wait_queue(&bh->b_wait, &wait);
160 atomic_dec(&bh->b_count);
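/*
 * [Editor's note - not part of the original buffer.c.]  The inline wrapper
 * mentioned above lives outside this file (the locks header in this kernel
 * generation).  Its expected shape - shown here only for context, as an
 * assumption rather than a verbatim copy - is a simple locked-bit test, so
 * the wait queue is never touched in the common, already-unlocked case:
 */
#if 0
static inline void wait_on_buffer(struct buffer_head * bh)
{
	if (test_bit(BH_Lock, &bh->b_state))
		__wait_on_buffer(bh);
}
#endif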
163 /* Call sync_buffers with wait!=0 to ensure that the call does not
164 * return until all buffer writes have completed. Sync() may return
165 * before the writes have finished; fsync() may not.
168 /* Godamity-damn. Some buffers (bitmaps for filesystems)
 169 * spontaneously dirty themselves without brelse ever being called.
170 * We will ultimately want to put these in a separate list, but for
171 * now we search all of the lists for dirty buffers.
173 static int sync_buffers(kdev_t dev, int wait)
175 int i, retry, pass = 0, err = 0;
176 struct buffer_head * bh, *next;
178 /* One pass for no-wait, three for wait:
179 * 0) write out all dirty, unlocked buffers;
180 * 1) write out all dirty buffers, waiting if locked;
181 * 2) wait for completion by waiting for all buffers to unlock.
183 do {
184 retry = 0;
186 /* We search all lists as a failsafe mechanism, not because we expect
187 * there to be dirty buffers on any of the other lists.
189 repeat:
190 spin_lock(&lru_list_lock);
191 bh = lru_list[BUF_DIRTY];
192 if (!bh)
193 goto repeat2;
195 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
196 next = bh->b_next_free;
198 if (!lru_list[BUF_DIRTY])
199 break;
200 if (dev && bh->b_dev != dev)
201 continue;
202 if (buffer_locked(bh)) {
203 /* Buffer is locked; skip it unless wait is
204 * requested AND pass > 0.
206 if (!wait || !pass) {
207 retry = 1;
208 continue;
210 atomic_inc(&bh->b_count);
211 spin_unlock(&lru_list_lock);
212 wait_on_buffer (bh);
213 atomic_dec(&bh->b_count);
214 goto repeat;
217 /* If an unlocked buffer is not uptodate, there has
218 * been an IO error. Skip it.
220 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
221 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
222 err = -EIO;
223 continue;
226 /* Don't write clean buffers. Don't write ANY buffers
227 * on the third pass.
229 if (!buffer_dirty(bh) || pass >= 2)
230 continue;
232 atomic_inc(&bh->b_count);
233 spin_unlock(&lru_list_lock);
234 ll_rw_block(WRITE, 1, &bh);
235 atomic_dec(&bh->b_count);
236 retry = 1;
237 goto repeat;
240 repeat2:
241 bh = lru_list[BUF_LOCKED];
242 if (!bh) {
243 spin_unlock(&lru_list_lock);
244 break;
246 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
247 next = bh->b_next_free;
249 if (!lru_list[BUF_LOCKED])
250 break;
251 if (dev && bh->b_dev != dev)
252 continue;
253 if (buffer_locked(bh)) {
254 /* Buffer is locked; skip it unless wait is
255 * requested AND pass > 0.
257 if (!wait || !pass) {
258 retry = 1;
259 continue;
261 atomic_inc(&bh->b_count);
262 spin_unlock(&lru_list_lock);
263 wait_on_buffer (bh);
264 spin_lock(&lru_list_lock);
265 atomic_dec(&bh->b_count);
266 goto repeat2;
269 spin_unlock(&lru_list_lock);
271 /* If we are waiting for the sync to succeed, and if any dirty
272 * blocks were written, then repeat; on the second pass, only
273 * wait for buffers being written (do not pass to write any
274 * more buffers on the second pass).
276 } while (wait && retry && ++pass<=2);
277 return err;
280 void sync_dev(kdev_t dev)
282 sync_supers(dev);
283 sync_inodes(dev);
284 DQUOT_SYNC(dev);
 285 /* sync all the dirty buffers out to disk only _after_ all the
 286    high level layers have finished generating dirty buffer data
 287    (or we'll return with some buffers still dirty on the block device,
 288    breaking the semantics of this call) */
289 sync_buffers(dev, 0);
291 * FIXME(eric) we need to sync the physical devices here.
292 * This is because some (scsi) controllers have huge amounts of
293 * cache onboard (hundreds of Mb), and we need to instruct
294 * them to commit all of the dirty memory to disk, and we should
295 * not return until this has happened.
297 * This would need to get implemented by going through the assorted
298 * layers so that each block major number can be synced, and this
299 * would call down into the upper and mid-layer scsi.
303 int fsync_dev(kdev_t dev)
305 sync_buffers(dev, 0);
307 lock_kernel();
308 sync_supers(dev);
309 sync_inodes(dev);
310 DQUOT_SYNC(dev);
311 unlock_kernel();
313 return sync_buffers(dev, 1);
316 asmlinkage long sys_sync(void)
318 fsync_dev(0);
319 return 0;
323 * filp may be NULL if called via the msync of a vma.
326 int file_fsync(struct file *filp, struct dentry *dentry)
328 struct inode * inode = dentry->d_inode;
329 struct super_block * sb;
330 kdev_t dev;
331 int ret;
333 lock_kernel();
334 /* sync the inode to buffers */
335 write_inode_now(inode);
337 /* sync the superblock to buffers */
338 sb = inode->i_sb;
339 wait_on_super(sb);
340 if (sb->s_op && sb->s_op->write_super)
341 sb->s_op->write_super(sb);
343 /* .. finally sync the buffers to disk */
344 dev = inode->i_dev;
345 ret = sync_buffers(dev, 1);
346 unlock_kernel();
347 return ret;
350 asmlinkage long sys_fsync(unsigned int fd)
352 struct file * file;
353 struct dentry * dentry;
354 struct inode * inode;
355 int err;
357 err = -EBADF;
358 file = fget(fd);
359 if (!file)
360 goto out;
362 dentry = file->f_dentry;
363 if (!dentry)
364 goto out_putf;
366 inode = dentry->d_inode;
367 if (!inode)
368 goto out_putf;
370 err = -EINVAL;
371 if (!file->f_op || !file->f_op->fsync)
372 goto out_putf;
374 /* We need to protect against concurrent writers.. */
375 down(&inode->i_sem);
376 err = file->f_op->fsync(file, dentry);
377 up(&inode->i_sem);
379 out_putf:
380 fput(file);
381 out:
382 return err;
385 asmlinkage long sys_fdatasync(unsigned int fd)
387 struct file * file;
388 struct dentry * dentry;
389 struct inode * inode;
390 int err;
392 err = -EBADF;
393 file = fget(fd);
394 if (!file)
395 goto out;
397 dentry = file->f_dentry;
398 if (!dentry)
399 goto out_putf;
401 inode = dentry->d_inode;
402 if (!inode)
403 goto out_putf;
405 err = -EINVAL;
406 if (!file->f_op || !file->f_op->fsync)
407 goto out_putf;
409 /* this needs further work, at the moment it is identical to fsync() */
410 down(&inode->i_sem);
411 err = file->f_op->fsync(file, dentry);
412 up(&inode->i_sem);
414 out_putf:
415 fput(file);
416 out:
417 return err;
420 /* After several hours of tedious analysis, the following hash
421 * function won. Do not mess with it... -DaveM
423 #define _hashfn(dev,block) \
424 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
425 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
426 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
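/*
 * [Editor's illustrative sketch - not part of the original buffer.c.]
 * _hashfn() folds dev and block through shifts derived from bh_hash_shift
 * (log2 of the bucket count, set up in buffer_init() at the bottom of the
 * file) and the result is masked into [0, bh_hash_mask].  A stand-alone
 * user-space rendition, assuming a 32768-bucket table (bh_hash_shift == 15):
 */
#if 0
#include <stdio.h>

static unsigned int bh_hash_shift = 15;
static unsigned int bh_hash_mask = (1 << 15) - 1;

static unsigned int bucket(unsigned int dev, unsigned int block)
{
	unsigned int h = ((dev << (bh_hash_shift - 6)) ^
			  (dev << (bh_hash_shift - 9))) ^
			 ((block << (bh_hash_shift - 6)) ^
			  (block >> 13) ^
			  (block << (bh_hash_shift - 12)));
	return h & bh_hash_mask;
}

int main(void)
{
	/* 0x0803 is an assumed kdev_t value (major 8, minor 3) */
	printf("block 0 -> bucket %u\n", bucket(0x0803, 0));
	printf("block 1 -> bucket %u\n", bucket(0x0803, 1));
	return 0;
}
#endif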
428 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
430 if ((bh->b_next = *head) != NULL)
431 bh->b_next->b_pprev = &bh->b_next;
432 *head = bh;
433 bh->b_pprev = head;
436 static __inline__ void __hash_unlink(struct buffer_head *bh)
438 if (bh->b_pprev) {
439 if (bh->b_next)
440 bh->b_next->b_pprev = bh->b_pprev;
441 *(bh->b_pprev) = bh->b_next;
442 bh->b_pprev = NULL;
446 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
448 struct buffer_head **bhp = &lru_list[blist];
450 if(!*bhp) {
451 *bhp = bh;
452 bh->b_prev_free = bh;
454 bh->b_next_free = *bhp;
455 bh->b_prev_free = (*bhp)->b_prev_free;
456 (*bhp)->b_prev_free->b_next_free = bh;
457 (*bhp)->b_prev_free = bh;
458 nr_buffers_type[blist]++;
459 size_buffers_type[blist] += bh->b_size;
462 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
464 if (bh->b_prev_free || bh->b_next_free) {
465 bh->b_prev_free->b_next_free = bh->b_next_free;
466 bh->b_next_free->b_prev_free = bh->b_prev_free;
467 if (lru_list[blist] == bh)
468 lru_list[blist] = bh->b_next_free;
469 if (lru_list[blist] == bh)
470 lru_list[blist] = NULL;
471 bh->b_next_free = bh->b_prev_free = NULL;
472 nr_buffers_type[blist]--;
473 size_buffers_type[blist] -= bh->b_size;
477 static void __remove_from_free_list(struct buffer_head * bh, int index)
479 if(bh->b_next_free == bh)
480 free_list[index].list = NULL;
481 else {
482 bh->b_prev_free->b_next_free = bh->b_next_free;
483 bh->b_next_free->b_prev_free = bh->b_prev_free;
484 if (free_list[index].list == bh)
485 free_list[index].list = bh->b_next_free;
487 bh->b_next_free = bh->b_prev_free = NULL;
490 /* must be called with both the hash_table_lock and the lru_list_lock
491 held */
492 static void __remove_from_queues(struct buffer_head *bh)
494 __hash_unlink(bh);
495 __remove_from_lru_list(bh, bh->b_list);
498 static void insert_into_queues(struct buffer_head *bh)
500 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
502 spin_lock(&lru_list_lock);
503 write_lock(&hash_table_lock);
504 __hash_link(bh, head);
505 __insert_into_lru_list(bh, bh->b_list);
506 write_unlock(&hash_table_lock);
507 spin_unlock(&lru_list_lock);
510 /* This function must only run if there are no other
511 * references _anywhere_ to this buffer head.
513 static void put_last_free(struct buffer_head * bh)
515 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
516 struct buffer_head **bhp = &head->list;
518 bh->b_state = 0;
520 spin_lock(&head->lock);
521 bh->b_dev = B_FREE;
522 if(!*bhp) {
523 *bhp = bh;
524 bh->b_prev_free = bh;
526 bh->b_next_free = *bhp;
527 bh->b_prev_free = (*bhp)->b_prev_free;
528 (*bhp)->b_prev_free->b_next_free = bh;
529 (*bhp)->b_prev_free = bh;
530 spin_unlock(&head->lock);
534 * Why like this, I hear you say... The reason is race-conditions.
535 * As we don't lock buffers (unless we are reading them, that is),
 536 * something might happen to one of them while we sleep (ie a read-error
537 * will force it bad). This shouldn't really happen currently, but
538 * the code is ready.
540 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
542 struct buffer_head **head = &hash(dev, block);
543 struct buffer_head *bh;
545 read_lock(&hash_table_lock);
546 for(bh = *head; bh; bh = bh->b_next)
547 if (bh->b_blocknr == block &&
548 bh->b_size == size &&
549 bh->b_dev == dev)
550 break;
551 if (bh)
552 atomic_inc(&bh->b_count);
553 read_unlock(&hash_table_lock);
555 return bh;
558 unsigned int get_hardblocksize(kdev_t dev)
561 * Get the hard sector size for the given device. If we don't know
562 * what it is, return 0.
564 if (hardsect_size[MAJOR(dev)] != NULL) {
565 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
566 if (blksize != 0)
567 return blksize;
571 * We don't know what the hardware sector size for this device is.
572 * Return 0 indicating that we don't know.
574 return 0;
 577 /* If invalidate_buffers() trashes dirty buffers, some kind of fs
 578    corruption is going on. Trashing dirty data always implies losing
 579    information that the user expected to be stored safely on the
 580    physical layer.
 582    Thus invalidate_buffers in general usage is not allowed to trash dirty
 583    buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
 585    NOTE: if the user removes a removable-media disk while there is still
 586    dirty data not synced to disk (due to a bug in the device driver
 587    or to an error of the user), then by not destroying the dirty buffers
 588    we could also corrupt the next media inserted. A parameter is therefore
 589    needed to handle this case as safely as possible (trying not to
 590    corrupt the newly inserted disk with data belonging to the old,
 591    now corrupted, one). Also, for the ramdisk the natural way to
 592    release the ramdisk memory is to destroy its dirty buffers.
 594    These are two special cases. Normal usage implies that the device
 595    driver issues a sync on the device (without waiting for I/O completion)
 596    and then calls invalidate_buffers in a way that doesn't trash dirty buffers. */
597 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
599 int i, nlist, slept;
600 struct buffer_head * bh, * bh_next;
602 retry:
603 slept = 0;
604 spin_lock(&lru_list_lock);
605 for(nlist = 0; nlist < NR_LIST; nlist++) {
606 bh = lru_list[nlist];
607 if (!bh)
608 continue;
609 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
610 bh_next = bh->b_next_free;
611 if (bh->b_dev != dev)
612 continue;
613 if (buffer_locked(bh)) {
614 atomic_inc(&bh->b_count);
615 spin_unlock(&lru_list_lock);
616 wait_on_buffer(bh);
617 slept = 1;
618 spin_lock(&lru_list_lock);
619 atomic_dec(&bh->b_count);
622 write_lock(&hash_table_lock);
623 if (!atomic_read(&bh->b_count) &&
624 (destroy_dirty_buffers || !buffer_dirty(bh))) {
625 __remove_from_queues(bh);
626 put_last_free(bh);
628 write_unlock(&hash_table_lock);
629 if (slept)
630 goto out;
633 out:
634 spin_unlock(&lru_list_lock);
635 if (slept)
636 goto retry;
639 void set_blocksize(kdev_t dev, int size)
641 extern int *blksize_size[];
642 int i, nlist, slept;
643 struct buffer_head * bh, * bh_next;
645 if (!blksize_size[MAJOR(dev)])
646 return;
648 /* Size must be a power of two, and between 512 and PAGE_SIZE */
649 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
650 panic("Invalid blocksize passed to set_blocksize");
652 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
653 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
654 return;
656 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
657 return;
658 sync_buffers(dev, 2);
659 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
661 retry:
662 slept = 0;
663 spin_lock(&lru_list_lock);
664 for(nlist = 0; nlist < NR_LIST; nlist++) {
665 bh = lru_list[nlist];
666 if (!bh)
667 continue;
668 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
669 bh_next = bh->b_next_free;
670 if (bh->b_dev != dev || bh->b_size == size)
671 continue;
672 if (buffer_locked(bh)) {
673 atomic_inc(&bh->b_count);
674 spin_unlock(&lru_list_lock);
675 wait_on_buffer(bh);
676 slept = 1;
677 spin_lock(&lru_list_lock);
678 atomic_dec(&bh->b_count);
681 write_lock(&hash_table_lock);
682 if (!atomic_read(&bh->b_count)) {
683 if (buffer_dirty(bh))
684 printk(KERN_WARNING
685 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
686 kdevname(dev), bh->b_blocknr, bh->b_size);
687 __remove_from_queues(bh);
688 put_last_free(bh);
689 } else {
690 if (atomic_set_buffer_clean(bh))
691 __refile_buffer(bh);
692 clear_bit(BH_Uptodate, &bh->b_state);
693 printk(KERN_WARNING
694 "set_blocksize: "
695 "b_count %d, dev %s, block %lu, from %p\n",
696 atomic_read(&bh->b_count), bdevname(bh->b_dev),
697 bh->b_blocknr, __builtin_return_address(0));
699 write_unlock(&hash_table_lock);
700 if (slept)
701 goto out;
704 out:
705 spin_unlock(&lru_list_lock);
706 if (slept)
707 goto retry;
711 * We used to try various strange things. Let's not.
713 static void refill_freelist(int size)
715 if (!grow_buffers(size)) {
716 wakeup_bdflush(1);
717 current->policy |= SCHED_YIELD;
718 schedule();
722 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
724 bh->b_list = BUF_CLEAN;
725 bh->b_end_io = handler;
726 bh->b_dev_id = dev_id;
729 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
731 mark_buffer_uptodate(bh, uptodate);
732 unlock_buffer(bh);
735 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
737 mark_buffer_uptodate(bh, uptodate);
738 unlock_buffer(bh);
739 BUG();
742 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
744 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
745 unsigned long flags;
746 struct buffer_head *tmp;
747 struct page *page;
749 mark_buffer_uptodate(bh, uptodate);
751 /* This is a temporary buffer used for page I/O. */
752 page = bh->b_page;
754 if (!uptodate)
755 SetPageError(page);
758 * Be _very_ careful from here on. Bad things can happen if
759 * two buffer heads end IO at almost the same time and both
760 * decide that the page is now completely done.
762 * Async buffer_heads are here only as labels for IO, and get
763 * thrown away once the IO for this page is complete. IO is
764 * deemed complete once all buffers have been visited
765 * (b_count==0) and are now unlocked. We must make sure that
766 * only the _last_ buffer that decrements its count is the one
 767 * that unlocks the page.
769 spin_lock_irqsave(&page_uptodate_lock, flags);
770 unlock_buffer(bh);
771 atomic_dec(&bh->b_count);
772 tmp = bh->b_this_page;
773 while (tmp != bh) {
774 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
775 goto still_busy;
776 tmp = tmp->b_this_page;
779 /* OK, the async IO on this page is complete. */
780 spin_unlock_irqrestore(&page_uptodate_lock, flags);
783 * if none of the buffers had errors then we can set the
784 * page uptodate:
786 if (!PageError(page))
787 SetPageUptodate(page);
790 * Run the hooks that have to be done when a page I/O has completed.
792 if (PageTestandClearDecrAfter(page))
793 atomic_dec(&nr_async_pages);
795 UnlockPage(page);
797 return;
799 still_busy:
800 spin_unlock_irqrestore(&page_uptodate_lock, flags);
801 return;
805 * Ok, this is getblk, and it isn't very clear, again to hinder
806 * race-conditions. Most of the code is seldom used, (ie repeating),
807 * so it should be much more efficient than it looks.
809 * The algorithm is changed: hopefully better, and an elusive bug removed.
811 * 14.02.92: changed it to sync dirty buffers a bit: better performance
812 * when the filesystem starts to get full of dirty blocks (I hope).
814 struct buffer_head * getblk(kdev_t dev, int block, int size)
816 struct buffer_head * bh;
817 int isize;
819 repeat:
820 bh = get_hash_table(dev, block, size);
821 if (bh)
822 goto out;
824 isize = BUFSIZE_INDEX(size);
825 spin_lock(&free_list[isize].lock);
826 bh = free_list[isize].list;
827 if (bh) {
828 __remove_from_free_list(bh, isize);
829 atomic_set(&bh->b_count, 1);
831 spin_unlock(&free_list[isize].lock);
834 * OK, FINALLY we know that this buffer is the only one of
835 * its kind, we hold a reference (b_count>0), it is unlocked,
836 * and it is clean.
838 if (bh) {
839 init_buffer(bh, end_buffer_io_sync, NULL);
840 bh->b_dev = dev;
841 bh->b_blocknr = block;
842 bh->b_state = 1 << BH_Mapped;
844 /* Insert the buffer into the regular lists */
845 insert_into_queues(bh);
846 out:
847 touch_buffer(bh);
848 return bh;
852 * If we block while refilling the free list, somebody may
853 * create the buffer first ... search the hashes again.
855 refill_freelist(size);
856 goto repeat;
859 /* -1 -> no need to flush
860 0 -> async flush
 861    1 -> sync flush (wait for I/O completion) */
862 static int balance_dirty_state(kdev_t dev)
864 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
866 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
867 tot = nr_free_buffer_pages();
868 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
870 dirty *= 200;
871 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
872 hard_dirty_limit = soft_dirty_limit * 2;
874 if (dirty > soft_dirty_limit) {
875 if (dirty > hard_dirty_limit)
876 return 1;
877 return 0;
879 return -1;
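/*
 * [Editor's illustrative sketch - not part of the original buffer.c.]
 * Since dirty is scaled by 200 before the compare, "dirty*200 > tot*nfract"
 * is equivalent to dirty/tot > nfract/200.  With the default nfract of 40
 * the async (soft) flush therefore kicks in at roughly 20% of the pool
 * computed above, and the synchronous (hard) limit - twice the soft one -
 * at roughly 40%.  A stand-alone rendition of the same decision:
 */
#if 0
#include <stdio.h>

static int dirty_state(unsigned long dirty, unsigned long tot, int nfract)
{
	unsigned long soft = tot * nfract, hard = soft * 2;

	dirty *= 200;
	if (dirty > soft)
		return dirty > hard ? 1 : 0;	/* 1: sync flush, 0: async */
	return -1;				/* no need to flush */
}

int main(void)
{
	/* 1000 buffer-cache pages total, nfract = 40 (the default above) */
	printf("%d %d %d\n",
	       dirty_state(150, 1000, 40),	/* 15% dirty -> -1 */
	       dirty_state(250, 1000, 40),	/* 25% dirty ->  0 */
	       dirty_state(450, 1000, 40));	/* 45% dirty ->  1 */
	return 0;
}
#endif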
883 * if a new dirty buffer is created we need to balance bdflush.
885 * in the future we might want to make bdflush aware of different
886 * pressures on different devices - thus the (currently unused)
887 * 'dev' parameter.
889 void balance_dirty(kdev_t dev)
891 int state = balance_dirty_state(dev);
893 if (state < 0)
894 return;
895 wakeup_bdflush(state);
898 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
900 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
901 refile_buffer(bh);
904 /* atomic version, the user must call balance_dirty() by hand
 905    as soon as it becomes possible to block */
906 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
908 if (!atomic_set_buffer_dirty(bh))
909 __mark_dirty(bh, flag);
912 void mark_buffer_dirty(struct buffer_head *bh, int flag)
914 __mark_buffer_dirty(bh, flag);
915 balance_dirty(bh->b_dev);
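/*
 * [Editor's illustrative sketch - not part of the original buffer.c.]
 * Intended calling pattern for the two variants above: a caller that cannot
 * block (e.g. while holding a spinlock) uses __mark_buffer_dirty() and defers
 * balance_dirty() until it can sleep again; mark_buffer_dirty() is the
 * convenience form for callers that may block right away.  The lock name
 * below is hypothetical - it only stands for "some spinlock held by the fs".
 */
#if 0
static void example_dirty_under_lock(struct buffer_head *bh, spinlock_t *fs_lock)
{
	spin_lock(fs_lock);
	__mark_buffer_dirty(bh, 0);	/* atomic: never blocks */
	spin_unlock(fs_lock);

	balance_dirty(bh->b_dev);	/* now we may block and wake bdflush */
}
#endif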
919 * A buffer may need to be moved from one buffer list to another
920 * (e.g. in case it is not shared any more). Handle this.
922 static void __refile_buffer(struct buffer_head *bh)
924 int dispose = BUF_CLEAN;
925 if (buffer_locked(bh))
926 dispose = BUF_LOCKED;
927 if (buffer_dirty(bh))
928 dispose = BUF_DIRTY;
929 if (buffer_protected(bh))
930 dispose = BUF_PROTECTED;
931 if (dispose != bh->b_list) {
932 __remove_from_lru_list(bh, bh->b_list);
933 bh->b_list = dispose;
934 __insert_into_lru_list(bh, dispose);
938 void refile_buffer(struct buffer_head *bh)
940 spin_lock(&lru_list_lock);
941 __refile_buffer(bh);
942 spin_unlock(&lru_list_lock);
946 * Release a buffer head
948 void __brelse(struct buffer_head * buf)
950 if (atomic_read(&buf->b_count)) {
951 atomic_dec(&buf->b_count);
952 return;
954 printk("VFS: brelse: Trying to free free buffer\n");
958 * bforget() is like brelse(), except it puts the buffer on the
959 * free list if it can.. We can NOT free the buffer if:
960 * - there are other users of it
961 * - it is locked and thus can have active IO
963 void __bforget(struct buffer_head * buf)
965 /* grab the lru lock here to block bdflush. */
966 spin_lock(&lru_list_lock);
967 write_lock(&hash_table_lock);
968 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
969 goto in_use;
970 __hash_unlink(buf);
971 write_unlock(&hash_table_lock);
972 __remove_from_lru_list(buf, buf->b_list);
973 spin_unlock(&lru_list_lock);
974 put_last_free(buf);
975 return;
977 in_use:
978 write_unlock(&hash_table_lock);
979 spin_unlock(&lru_list_lock);
983 * bread() reads a specified block and returns the buffer that contains
984 * it. It returns NULL if the block was unreadable.
986 struct buffer_head * bread(kdev_t dev, int block, int size)
988 struct buffer_head * bh;
990 bh = getblk(dev, block, size);
991 if (buffer_uptodate(bh))
992 return bh;
993 ll_rw_block(READ, 1, &bh);
994 wait_on_buffer(bh);
995 if (buffer_uptodate(bh))
996 return bh;
997 brelse(bh);
998 return NULL;
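/*
 * [Editor's illustrative sketch - not part of the original buffer.c.]
 * Typical filesystem use of bread(): read one metadata block, use it, then
 * drop the reference with brelse() (the usual wrapper around __brelse()).
 * The block number and block size below are made-up example values.
 */
#if 0
static int example_read_block(kdev_t dev)
{
	struct buffer_head *bh = bread(dev, 42, 1024);	/* block 42, 1KB */

	if (!bh)
		return -EIO;			/* unreadable block */
	/* ... examine bh->b_data ... */
	brelse(bh);
	return 0;
}
#endif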
1002 * Ok, breada can be used as bread, but additionally to mark other
1003 * blocks for reading as well. End the argument list with a negative
1004 * number.
1007 #define NBUF 16
1009 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1010 unsigned int pos, unsigned int filesize)
1012 struct buffer_head * bhlist[NBUF];
1013 unsigned int blocks;
1014 struct buffer_head * bh;
1015 int index;
1016 int i, j;
1018 if (pos >= filesize)
1019 return NULL;
1021 if (block < 0)
1022 return NULL;
1024 bh = getblk(dev, block, bufsize);
1025 index = BUFSIZE_INDEX(bh->b_size);
1027 if (buffer_uptodate(bh))
1028 return(bh);
1029 else ll_rw_block(READ, 1, &bh);
1031 blocks = (filesize - pos) >> (9+index);
1033 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1034 blocks = read_ahead[MAJOR(dev)] >> index;
1035 if (blocks > NBUF)
1036 blocks = NBUF;
1038 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1040 bhlist[0] = bh;
1041 j = 1;
1042 for(i=1; i<blocks; i++) {
1043 bh = getblk(dev,block+i,bufsize);
1044 if (buffer_uptodate(bh)) {
1045 brelse(bh);
1046 break;
1048 else bhlist[j++] = bh;
1051 /* Request the read for these buffers, and then release them. */
1052 if (j>1)
1053 ll_rw_block(READA, (j-1), bhlist+1);
1054 for(i=1; i<j; i++)
1055 brelse(bhlist[i]);
1057 /* Wait for this buffer, and then continue on. */
1058 bh = bhlist[0];
1059 wait_on_buffer(bh);
1060 if (buffer_uptodate(bh))
1061 return bh;
1062 brelse(bh);
1063 return NULL;
1067 * Note: the caller should wake up the buffer_wait list if needed.
1069 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1071 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1072 kmem_cache_free(bh_cachep, bh);
1073 } else {
1074 bh->b_blocknr = -1;
1075 init_waitqueue_head(&bh->b_wait);
1076 nr_unused_buffer_heads++;
1077 bh->b_next_free = unused_list;
1078 bh->b_this_page = NULL;
1079 unused_list = bh;
1084 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1085 * no-buffer-head deadlock. Return NULL on failure; waiting for
1086 * buffer heads is now handled in create_buffers().
1088 static struct buffer_head * get_unused_buffer_head(int async)
1090 struct buffer_head * bh;
1092 spin_lock(&unused_list_lock);
1093 if (nr_unused_buffer_heads > NR_RESERVED) {
1094 bh = unused_list;
1095 unused_list = bh->b_next_free;
1096 nr_unused_buffer_heads--;
1097 spin_unlock(&unused_list_lock);
1098 return bh;
1100 spin_unlock(&unused_list_lock);
1102 /* This is critical. We can't swap out pages to get
1103 * more buffer heads, because the swap-out may need
1104 * more buffer-heads itself. Thus SLAB_BUFFER.
1106 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1107 memset(bh, 0, sizeof(*bh));
1108 init_waitqueue_head(&bh->b_wait);
1109 return bh;
1113 * If we need an async buffer, use the reserved buffer heads.
1115 if (async) {
1116 spin_lock(&unused_list_lock);
1117 if (unused_list) {
1118 bh = unused_list;
1119 unused_list = bh->b_next_free;
1120 nr_unused_buffer_heads--;
1121 spin_unlock(&unused_list_lock);
1122 return bh;
1124 spin_unlock(&unused_list_lock);
1126 #if 0
1128 * (Pending further analysis ...)
1129 * Ordinary (non-async) requests can use a different memory priority
1130 * to free up pages. Any swapping thus generated will use async
1131 * buffer heads.
1133 if(!async &&
1134 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1135 memset(bh, 0, sizeof(*bh));
1136 init_waitqueue_head(&bh->b_wait);
1137 return bh;
1139 #endif
1141 return NULL;
1144 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1146 bh->b_page = page;
1147 if (offset >= PAGE_SIZE)
1148 BUG();
1149 if (PageHighMem(page))
1151 * This catches illegal uses and preserves the offset:
1153 bh->b_data = (char *)(0 + offset);
1154 else
1155 bh->b_data = (char *)(page_address(page) + offset);
1159 * Create the appropriate buffers when given a page for data area and
1160 * the size of each buffer.. Use the bh->b_this_page linked list to
1161 * follow the buffers created. Return NULL if unable to create more
1162 * buffers.
1163 * The async flag is used to differentiate async IO (paging, swapping)
1164 * from ordinary buffer allocations, and only async requests are allowed
1165 * to sleep waiting for buffer heads.
1167 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1169 struct buffer_head *bh, *head;
1170 long offset;
1172 try_again:
1173 head = NULL;
1174 offset = PAGE_SIZE;
1175 while ((offset -= size) >= 0) {
1176 bh = get_unused_buffer_head(async);
1177 if (!bh)
1178 goto no_grow;
1180 bh->b_dev = B_FREE; /* Flag as unused */
1181 bh->b_this_page = head;
1182 head = bh;
1184 bh->b_state = 0;
1185 bh->b_next_free = NULL;
1186 bh->b_pprev = NULL;
1187 atomic_set(&bh->b_count, 0);
1188 bh->b_size = size;
1190 set_bh_page(bh, page, offset);
1192 bh->b_list = BUF_CLEAN;
1193 bh->b_end_io = end_buffer_io_bad;
1195 return head;
1197 * In case anything failed, we just free everything we got.
1199 no_grow:
1200 if (head) {
1201 spin_lock(&unused_list_lock);
1202 do {
1203 bh = head;
1204 head = head->b_this_page;
1205 __put_unused_buffer_head(bh);
1206 } while (head);
1207 spin_unlock(&unused_list_lock);
1209 /* Wake up any waiters ... */
1210 wake_up(&buffer_wait);
1214 * Return failure for non-async IO requests. Async IO requests
1215 * are not allowed to fail, so we have to wait until buffer heads
1216 * become available. But we don't want tasks sleeping with
1217 * partially complete buffers, so all were released above.
1219 if (!async)
1220 return NULL;
1222 /* We're _really_ low on memory. Now we just
1223 * wait for old buffer heads to become free due to
1224 * finishing IO. Since this is an async request and
1225 * the reserve list is empty, we're sure there are
1226 * async buffer heads in use.
1228 run_task_queue(&tq_disk);
1231 * Set our state for sleeping, then check again for buffer heads.
1232 * This ensures we won't miss a wake_up from an interrupt.
1234 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1235 goto try_again;
1238 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1240 struct buffer_head *head, *bh, *tail;
1241 int block;
1243 if (!PageLocked(page))
1244 BUG();
1246 * Allocate async buffer heads pointing to this page, just for I/O.
1247 * They don't show up in the buffer hash table, but they *are*
1248 * registered in page->buffers.
1250 head = create_buffers(page, size, 1);
1251 if (page->buffers)
1252 BUG();
1253 if (!head)
1254 BUG();
1255 tail = head;
1256 for (bh = head; bh; bh = bh->b_this_page) {
1257 block = *(b++);
1259 tail = bh;
1260 init_buffer(bh, end_buffer_io_async, NULL);
1261 bh->b_dev = dev;
1262 bh->b_blocknr = block;
1264 set_bit(BH_Mapped, &bh->b_state);
1266 tail->b_this_page = head;
1267 get_page(page);
1268 page->buffers = head;
1269 return 0;
1272 static void unmap_buffer(struct buffer_head * bh)
1274 if (buffer_mapped(bh)) {
1275 mark_buffer_clean(bh);
1276 wait_on_buffer(bh);
1277 clear_bit(BH_Uptodate, &bh->b_state);
1278 clear_bit(BH_Mapped, &bh->b_state);
1279 clear_bit(BH_Req, &bh->b_state);
1280 clear_bit(BH_New, &bh->b_state);
1285 * We don't have to release all buffers here, but
1286 * we have to be sure that no dirty buffer is left
1287 * and no IO is going on (no buffer is locked), because
1288 * we have truncated the file and are going to free the
1289 * blocks on-disk..
1291 int block_flushpage(struct page *page, unsigned long offset)
1293 struct buffer_head *head, *bh, *next;
1294 unsigned int curr_off = 0;
1296 if (!PageLocked(page))
1297 BUG();
1298 if (!page->buffers)
1299 return 1;
1301 head = page->buffers;
1302 bh = head;
1303 do {
1304 unsigned int next_off = curr_off + bh->b_size;
1305 next = bh->b_this_page;
1308 * is this block fully flushed?
1310 if (offset <= curr_off)
1311 unmap_buffer(bh);
1312 curr_off = next_off;
1313 bh = next;
1314 } while (bh != head);
1317 * subtle. We release buffer-heads only if this is
1318 * the 'final' flushpage. We have invalidated the get_block
1319 * cached value unconditionally, so real IO is not
1320 * possible anymore.
1322 * If the free doesn't work out, the buffers can be
1323 * left around - they just turn into anonymous buffers
1324 * instead.
1326 if (!offset) {
1327 if (!try_to_free_buffers(page)) {
1328 atomic_inc(&buffermem_pages);
1329 return 0;
1333 return 1;
1336 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1338 struct buffer_head *bh, *head, *tail;
1340 head = create_buffers(page, blocksize, 1);
1341 if (page->buffers)
1342 BUG();
1344 bh = head;
1345 do {
1346 bh->b_dev = inode->i_dev;
1347 bh->b_blocknr = 0;
1348 bh->b_end_io = end_buffer_io_bad;
1349 tail = bh;
1350 bh = bh->b_this_page;
1351 } while (bh);
1352 tail->b_this_page = head;
1353 page->buffers = head;
1354 get_page(page);
1357 static void unmap_underlying_metadata(struct buffer_head * bh)
1359 struct buffer_head *old_bh;
1361 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1362 if (old_bh) {
1363 unmap_buffer(old_bh);
1364 /* Here we could run brelse or bforget. We use
1365 bforget because it will try to put the buffer
1366 in the freelist. */
1367 __bforget(old_bh);
1372 * block_write_full_page() is SMP-safe - currently it's still
1373 * being called with the kernel lock held, but the code is ready.
1375 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1377 int err, i, need_balance_dirty = 0;
1378 unsigned long block;
1379 struct buffer_head *bh, *head;
1381 if (!PageLocked(page))
1382 BUG();
1384 if (!page->buffers)
1385 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1386 head = page->buffers;
1388 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1390 bh = head;
1391 i = 0;
1392 do {
1394 * If the buffer isn't up-to-date, we can't be sure
1395 * that the buffer has been initialized with the proper
1396 * block number information etc..
1398 * Leave it to the low-level FS to make all those
1399 * decisions (block #0 may actually be a valid block)
1401 bh->b_end_io = end_buffer_io_sync;
1402 if (!buffer_mapped(bh)) {
1403 err = get_block(inode, block, bh, 1);
1404 if (err)
1405 goto out;
1406 if (buffer_new(bh))
1407 unmap_underlying_metadata(bh);
1409 set_bit(BH_Uptodate, &bh->b_state);
1410 if (!atomic_set_buffer_dirty(bh)) {
1411 __mark_dirty(bh, 0);
1412 need_balance_dirty = 1;
1415 bh = bh->b_this_page;
1416 block++;
1417 } while (bh != head);
1419 if (need_balance_dirty)
1420 balance_dirty(bh->b_dev);
1422 SetPageUptodate(page);
1423 return 0;
1424 out:
1425 ClearPageUptodate(page);
1426 return err;
1429 static int __block_prepare_write(struct inode *inode, struct page *page,
1430 unsigned from, unsigned to, get_block_t *get_block)
1432 unsigned block_start, block_end;
1433 unsigned long block;
1434 int err = 0;
1435 unsigned blocksize, bbits;
1436 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1437 char *kaddr = (char *)kmap(page);
1439 blocksize = inode->i_sb->s_blocksize;
1440 if (!page->buffers)
1441 create_empty_buffers(page, inode, blocksize);
1442 head = page->buffers;
1444 bbits = inode->i_sb->s_blocksize_bits;
1445 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1447 for(bh = head, block_start = 0; bh != head || !block_start;
1448 block++, block_start=block_end, bh = bh->b_this_page) {
1449 if (!bh)
1450 BUG();
1451 block_end = block_start+blocksize;
1452 if (block_end <= from)
1453 continue;
1454 if (block_start >= to)
1455 break;
1456 bh->b_end_io = end_buffer_io_sync;
1457 if (!buffer_mapped(bh)) {
1458 err = get_block(inode, block, bh, 1);
1459 if (err)
1460 goto out;
1461 if (buffer_new(bh)) {
1462 unmap_underlying_metadata(bh);
1463 if (block_end > to)
1464 memset(kaddr+to, 0, block_end-to);
1465 if (block_start < from)
1466 memset(kaddr+block_start, 0, from-block_start);
1467 continue;
1470 if (!buffer_uptodate(bh) &&
1471 (block_start < from || block_end > to)) {
1472 ll_rw_block(READ, 1, &bh);
1473 *wait_bh++=bh;
1477 * If we issued read requests - let them complete.
1479 while(wait_bh > wait) {
1480 wait_on_buffer(*--wait_bh);
1481 err = -EIO;
1482 if (!buffer_uptodate(*wait_bh))
1483 goto out;
1485 return 0;
1486 out:
1487 return err;
1490 static int __block_commit_write(struct inode *inode, struct page *page,
1491 unsigned from, unsigned to)
1493 unsigned block_start, block_end;
1494 int partial = 0, need_balance_dirty = 0;
1495 unsigned blocksize;
1496 struct buffer_head *bh, *head;
1498 blocksize = inode->i_sb->s_blocksize;
1500 for(bh = head = page->buffers, block_start = 0;
1501 bh != head || !block_start;
1502 block_start=block_end, bh = bh->b_this_page) {
1503 block_end = block_start + blocksize;
1504 if (block_end <= from || block_start >= to) {
1505 if (!buffer_uptodate(bh))
1506 partial = 1;
1507 } else {
1508 set_bit(BH_Uptodate, &bh->b_state);
1509 if (!atomic_set_buffer_dirty(bh)) {
1510 __mark_dirty(bh, 0);
1511 need_balance_dirty = 1;
1516 if (need_balance_dirty)
1517 balance_dirty(bh->b_dev);
 1519 * If this is a partial write that happened to make all buffers
 1520 * uptodate then we can optimize away a bogus readpage() for
 1521 * the next read(). Here we 'discover' whether the page went
 1522 * uptodate as a result of this (potentially partial) write.
1524 if (!partial)
1525 SetPageUptodate(page);
1526 return 0;
1530 * Generic "read page" function for block devices that have the normal
1531 * get_block functionality. This is most of the block device filesystems.
1532 * Reads the page asynchronously --- the unlock_buffer() and
1533 * mark_buffer_uptodate() functions propagate buffer state into the
1534 * page struct once IO has completed.
1536 int block_read_full_page(struct page *page, get_block_t *get_block)
1538 struct inode *inode = (struct inode*)page->mapping->host;
1539 unsigned long iblock, lblock;
1540 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1541 unsigned int blocksize, blocks;
1542 unsigned long kaddr = 0;
1543 int nr, i;
1545 if (!PageLocked(page))
1546 PAGE_BUG(page);
1547 blocksize = inode->i_sb->s_blocksize;
1548 if (!page->buffers)
1549 create_empty_buffers(page, inode, blocksize);
1550 head = page->buffers;
1552 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1553 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1554 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1555 bh = head;
1556 nr = 0;
1557 i = 0;
1559 do {
1560 if (buffer_uptodate(bh))
1561 continue;
1563 if (!buffer_mapped(bh)) {
1564 if (iblock < lblock)
1565 get_block(inode, iblock, bh, 0);
1566 if (!buffer_mapped(bh)) {
1567 if (!kaddr)
1568 kaddr = kmap(page);
1569 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1570 set_bit(BH_Uptodate, &bh->b_state);
1571 continue;
1575 init_buffer(bh, end_buffer_io_async, NULL);
1576 atomic_inc(&bh->b_count);
1577 arr[nr] = bh;
1578 nr++;
1579 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1581 if (nr) {
1582 if (Page_Uptodate(page))
1583 BUG();
1584 ll_rw_block(READ, nr, arr);
1585 } else {
1587 * all buffers are uptodate - we can set the page
1588 * uptodate as well.
1590 SetPageUptodate(page);
1591 UnlockPage(page);
1593 if (kaddr)
1594 kunmap(page);
1595 return 0;
 1599 * For moronic filesystems that do not allow holes in files.
1600 * We may have to extend the file.
1603 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1605 struct address_space *mapping = page->mapping;
1606 struct inode *inode = (struct inode*)mapping->host;
1607 struct page *new_page;
1608 unsigned long pgpos;
1609 long status;
1610 unsigned zerofrom;
1611 unsigned blocksize = inode->i_sb->s_blocksize;
1612 char *kaddr;
1614 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1615 status = -ENOMEM;
1616 new_page = grab_cache_page(mapping, pgpos);
1617 if (!new_page)
1618 goto out;
1619 /* we might sleep */
1620 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1621 UnlockPage(new_page);
1622 page_cache_release(new_page);
1623 continue;
1625 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1626 if (zerofrom & (blocksize-1)) {
1627 *bytes |= (blocksize-1);
1628 (*bytes)++;
1630 status = __block_prepare_write(inode, new_page, zerofrom,
1631 PAGE_CACHE_SIZE, get_block);
1632 if (status)
1633 goto out_unmap;
1634 kaddr = (char*)page_address(page);
1635 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1636 __block_commit_write(inode, new_page, zerofrom, to);
1637 kunmap(new_page);
1638 UnlockPage(new_page);
1639 page_cache_release(new_page);
1642 if (page->index < pgpos) {
1643 /* completely inside the area */
1644 zerofrom = offset;
1645 } else {
1646 /* page covers the boundary, find the boundary offset */
1647 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1649 /* if we will expand the thing last block will be filled */
1650 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1651 *bytes |= (blocksize-1);
1652 (*bytes)++;
1655 /* starting below the boundary? Nothing to zero out */
1656 if (offset <= zerofrom)
1657 zerofrom = offset;
1659 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1660 if (status)
1661 goto out1;
1662 kaddr = (char*)page_address(page);
1663 if (zerofrom < offset) {
1664 memset(kaddr+zerofrom, 0, offset-zerofrom);
1665 __block_commit_write(inode, page, zerofrom, offset);
1667 return 0;
1668 out1:
1669 ClearPageUptodate(page);
1670 kunmap(page);
1671 return status;
1673 out_unmap:
1674 ClearPageUptodate(new_page);
1675 kunmap(new_page);
1676 UnlockPage(new_page);
1677 page_cache_release(new_page);
1678 out:
1679 return status;
1682 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1683 get_block_t *get_block)
1685 struct inode *inode = (struct inode*)page->mapping->host;
1686 int err = __block_prepare_write(inode, page, from, to, get_block);
1687 if (err) {
1688 ClearPageUptodate(page);
1689 kunmap(page);
1691 return err;
1694 int generic_commit_write(struct file *file, struct page *page,
1695 unsigned from, unsigned to)
1697 struct inode *inode = (struct inode*)page->mapping->host;
1698 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1699 __block_commit_write(inode,page,from,to);
1700 kunmap(page);
1701 if (pos > inode->i_size)
1702 inode->i_size = pos;
1703 return 0;
1706 int block_write_full_page(struct page *page, get_block_t *get_block)
1708 struct inode *inode = (struct inode*)page->mapping->host;
1709 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1710 unsigned offset;
1711 int err;
1713 /* easy case */
1714 if (page->index < end_index)
1715 return __block_write_full_page(inode, page, get_block);
1717 /* things got complicated... */
1718 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1719 /* OK, are we completely out? */
1720 if (page->index >= end_index+1 || !offset)
1721 return -EIO;
1722 /* Sigh... will have to work, then... */
1723 err = __block_prepare_write(inode, page, 0, offset, get_block);
1724 if (!err) {
1725 memset((char *)page_address(page)+offset, 0, PAGE_CACHE_SIZE-offset);
1726 __block_commit_write(inode,page,0,offset);
1727 done:
1728 kunmap(page);
1729 return err;
1731 ClearPageUptodate(page);
1732 goto done;
1735 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1737 struct buffer_head tmp;
1738 struct inode *inode = (struct inode*)mapping->host;
1739 tmp.b_state = 0;
1740 tmp.b_blocknr = 0;
1741 get_block(inode, block, &tmp, 0);
1742 return tmp.b_blocknr;
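/*
 * [Editor's illustrative sketch - not part of the original buffer.c.]
 * The generic helpers above are meant to be wired into a filesystem's
 * struct address_space_operations alongside its own get_block routine.
 * "myfs" and myfs_get_block are hypothetical; the get_block_t prototype and
 * the a_ops method signatures are assumed from how this file calls them
 * (readpage/prepare_write/commit_write), so check the fs headers of this
 * kernel generation for the authoritative definitions.  Note that
 * generic_commit_write() above already has the commit_write signature and
 * can usually be used directly.
 */
#if 0
static int myfs_get_block(struct inode *inode, long block,
			  struct buffer_head *bh_result, int create)
{
	/* map the logical block to a device block, set BH_Mapped etc. */
	return 0;
}

static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}

static int myfs_prepare_write(struct file *file, struct page *page,
			      unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, myfs_get_block);
}

static int myfs_bmap(struct address_space *mapping, long block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}
#endif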
1746 * IO completion routine for a buffer_head being used for kiobuf IO: we
1747 * can't dispatch the kiobuf callback until io_count reaches 0.
1750 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1752 struct kiobuf *kiobuf;
1754 mark_buffer_uptodate(bh, uptodate);
1756 kiobuf = bh->b_kiobuf;
1757 unlock_buffer(bh);
1758 end_kio_request(kiobuf, uptodate);
1763 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1764 * for them to complete. Clean up the buffer_heads afterwards.
1767 static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
1769 int iosize;
1770 int i;
1771 struct buffer_head *tmp;
1773 struct task_struct *tsk = current;
1774 DECLARE_WAITQUEUE(wait, tsk);
1776 if (rw == WRITE)
1777 rw = WRITERAW;
1778 ll_rw_block(rw, nr, bh);
1780 iosize = 0;
1781 spin_lock(&unused_list_lock);
1783 for (i = nr; --i >= 0; ) {
1784 iosize += size;
1785 tmp = bh[i];
1786 if (buffer_locked(tmp)) {
1787 spin_unlock(&unused_list_lock);
1788 wait_on_buffer(tmp);
1789 spin_lock(&unused_list_lock);
1792 if (!buffer_uptodate(tmp)) {
1793 /* We are traversing bh'es in reverse order so
1794 clearing iosize on error calculates the
1795 amount of IO before the first error. */
1796 iosize = 0;
1798 __put_unused_buffer_head(tmp);
1801 spin_unlock(&unused_list_lock);
1803 return iosize;
1807 * Start I/O on a physical range of kernel memory, defined by a vector
1808 * of kiobuf structs (much like a user-space iovec list).
1810 * The kiobuf must already be locked for IO. IO is submitted
1811 * asynchronously: you need to check page->locked, page->uptodate, and
1812 * maybe wait on page->wait.
1814 * It is up to the caller to make sure that there are enough blocks
1815 * passed in to completely map the iobufs to disk.
1818 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1819 kdev_t dev, unsigned long b[], int size)
1821 int err;
1822 int length;
1823 int transferred;
1824 int i;
1825 int bufind;
1826 int pageind;
1827 int bhind;
1828 int offset;
1829 unsigned long blocknr;
1830 struct kiobuf * iobuf = NULL;
1831 struct page * map;
1832 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1834 if (!nr)
1835 return 0;
1838 * First, do some alignment and validity checks
1840 for (i = 0; i < nr; i++) {
1841 iobuf = iovec[i];
1842 if ((iobuf->offset & (size-1)) ||
1843 (iobuf->length & (size-1)))
1844 return -EINVAL;
1845 if (!iobuf->nr_pages)
1846 panic("brw_kiovec: iobuf not initialised");
1850 * OK to walk down the iovec doing page IO on each page we find.
1852 bufind = bhind = transferred = err = 0;
1853 for (i = 0; i < nr; i++) {
1854 iobuf = iovec[i];
1855 offset = iobuf->offset;
1856 length = iobuf->length;
1857 iobuf->errno = 0;
1859 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1860 map = iobuf->maplist[pageind];
1861 if (!map) {
1862 err = -EFAULT;
1863 goto error;
1866 while (length > 0) {
1867 blocknr = b[bufind++];
1868 tmp = get_unused_buffer_head(0);
1869 if (!tmp) {
1870 err = -ENOMEM;
1871 goto error;
1874 tmp->b_dev = B_FREE;
1875 tmp->b_size = size;
1876 set_bh_page(tmp, map, offset);
1877 tmp->b_this_page = tmp;
1879 init_buffer(tmp, end_buffer_io_kiobuf, NULL);
1880 tmp->b_dev = dev;
1881 tmp->b_blocknr = blocknr;
1882 tmp->b_state = 1 << BH_Mapped;
1883 tmp->b_kiobuf = iobuf;
1885 if (rw == WRITE) {
1886 set_bit(BH_Uptodate, &tmp->b_state);
1887 set_bit(BH_Dirty, &tmp->b_state);
1890 bh[bhind++] = tmp;
1891 length -= size;
1892 offset += size;
1894 atomic_inc(&iobuf->io_count);
1897 * Start the IO if we have got too much
1899 if (bhind >= KIO_MAX_SECTORS) {
1900 err = do_kio(rw, bhind, bh, size);
1901 if (err >= 0)
1902 transferred += err;
1903 else
1904 goto finished;
1905 bhind = 0;
1908 if (offset >= PAGE_SIZE) {
1909 offset = 0;
1910 break;
1912 } /* End of block loop */
1913 } /* End of page loop */
1914 } /* End of iovec loop */
1916 /* Is there any IO still left to submit? */
1917 if (bhind) {
1918 err = do_kio(rw, bhind, bh, size);
1919 if (err >= 0)
1920 transferred += err;
1921 else
1922 goto finished;
1925 finished:
1926 if (transferred)
1927 return transferred;
1928 return err;
1930 error:
1931 /* We got an error allocating the bh'es. Just free the current
1932 buffer_heads and exit. */
1933 spin_lock(&unused_list_lock);
1934 for (i = bhind; --i >= 0; ) {
 1935 __put_unused_buffer_head(bh[i]);
1937 spin_unlock(&unused_list_lock);
1938 goto finished;
1942 * Start I/O on a page.
1943 * This function expects the page to be locked and may return
1944 * before I/O is complete. You then have to check page->locked,
1945 * page->uptodate, and maybe wait on page->wait.
 1947 * brw_page() is SMP-safe - although it's still being called with the
 1948 * kernel lock held, the code is ready.
1950 * FIXME: we need a swapper_inode->get_block function to remove
1951 * some of the bmap kludges and interface ugliness here.
1953 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
1955 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1956 int nr, fresh /* temporary debugging flag */, block;
1958 if (!PageLocked(page))
1959 panic("brw_page: page not locked for I/O");
1960 // ClearPageError(page);
1962 * We pretty much rely on the page lock for this, because
1963 * create_page_buffers() might sleep.
1965 fresh = 0;
1966 if (!page->buffers) {
1967 create_page_buffers(rw, page, dev, b, size);
1968 fresh = 1;
1970 if (!page->buffers)
1971 BUG();
1973 head = page->buffers;
1974 bh = head;
1975 nr = 0;
1976 do {
1977 block = *(b++);
1979 if (fresh && (atomic_read(&bh->b_count) != 0))
1980 BUG();
1981 if (rw == READ) {
1982 if (!fresh)
1983 BUG();
1984 if (!buffer_uptodate(bh)) {
1985 arr[nr++] = bh;
1986 atomic_inc(&bh->b_count);
1988 } else { /* WRITE */
1989 if (!bh->b_blocknr) {
1990 if (!block)
1991 BUG();
1992 bh->b_blocknr = block;
1993 } else {
1994 if (!block)
1995 BUG();
1997 set_bit(BH_Uptodate, &bh->b_state);
1998 set_bit(BH_Dirty, &bh->b_state);
1999 arr[nr++] = bh;
2000 atomic_inc(&bh->b_count);
2002 bh = bh->b_this_page;
2003 } while (bh != head);
2004 if ((rw == READ) && nr) {
2005 if (Page_Uptodate(page))
2006 BUG();
2007 ll_rw_block(rw, nr, arr);
2008 } else {
2009 if (!nr && rw == READ) {
2010 SetPageUptodate(page);
2011 UnlockPage(page);
2013 if (nr && (rw == WRITE))
2014 ll_rw_block(rw, nr, arr);
2016 return 0;
2019 int block_symlink(struct inode *inode, const char *symname, int len)
2021 struct address_space *mapping = inode->i_mapping;
2022 struct page *page = grab_cache_page(mapping, 0);
2023 int err = -ENOMEM;
2024 char *kaddr;
2026 if (!page)
2027 goto fail;
2028 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2029 if (err)
2030 goto fail_map;
2031 kaddr = (char*)page_address(page);
2032 memcpy(kaddr, symname, len-1);
2033 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2035 * Notice that we are _not_ going to block here - end of page is
2036 * unmapped, so this will only try to map the rest of page, see
2037 * that it is unmapped (typically even will not look into inode -
2038 * ->i_size will be enough for everything) and zero it out.
2039 * OTOH it's obviously correct and should make the page up-to-date.
2041 err = mapping->a_ops->readpage(NULL, page);
2042 wait_on_page(page);
2043 page_cache_release(page);
2044 if (err < 0)
2045 goto fail;
2046 mark_inode_dirty(inode);
2047 return 0;
2048 fail_map:
2049 UnlockPage(page);
2050 page_cache_release(page);
2051 fail:
2052 return err;
2056 * Try to increase the number of buffers available: the size argument
2057 * is used to determine what kind of buffers we want.
2059 static int grow_buffers(int size)
2061 struct page * page;
2062 struct buffer_head *bh, *tmp;
2063 struct buffer_head * insert_point;
2064 int isize;
2066 if ((size & 511) || (size > PAGE_SIZE)) {
2067 printk("VFS: grow_buffers: size = %d\n",size);
2068 return 0;
2071 page = alloc_page(GFP_BUFFER);
2072 if (!page)
2073 goto out;
2074 bh = create_buffers(page, size, 0);
2075 if (!bh)
2076 goto no_buffer_head;
2078 isize = BUFSIZE_INDEX(size);
2080 spin_lock(&free_list[isize].lock);
2081 insert_point = free_list[isize].list;
2082 tmp = bh;
2083 while (1) {
2084 if (insert_point) {
2085 tmp->b_next_free = insert_point->b_next_free;
2086 tmp->b_prev_free = insert_point;
2087 insert_point->b_next_free->b_prev_free = tmp;
2088 insert_point->b_next_free = tmp;
2089 } else {
2090 tmp->b_prev_free = tmp;
2091 tmp->b_next_free = tmp;
2093 insert_point = tmp;
2094 if (tmp->b_this_page)
2095 tmp = tmp->b_this_page;
2096 else
2097 break;
2099 tmp->b_this_page = bh;
2100 free_list[isize].list = bh;
2101 spin_unlock(&free_list[isize].lock);
2103 page->buffers = bh;
2104 lru_cache_add(page);
2105 atomic_inc(&buffermem_pages);
2106 return 1;
2108 no_buffer_head:
2109 __free_page(page);
2110 out:
2111 return 0;
2115 * Sync all the buffers on one page..
2117 * If we have old buffers that are locked, we'll
2118 * wait on them, but we won't wait on the new ones
2119 * we're writing out now.
2121 * This all is required so that we can free up memory
2122 * later.
2124 static void sync_page_buffers(struct buffer_head *bh)
2126 struct buffer_head * tmp;
2128 tmp = bh;
2129 do {
2130 struct buffer_head *p = tmp;
2131 tmp = tmp->b_this_page;
2132 if (buffer_locked(p))
2133 __wait_on_buffer(p);
2134 else if (buffer_dirty(p))
2135 ll_rw_block(WRITE, 1, &p);
2136 } while (tmp != bh);
2140 * Can the buffer be thrown out?
2142 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2143 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2146 * try_to_free_buffers() checks if all the buffers on this particular page
2147 * are unused, and free's the page if so.
2149 * Wake up bdflush() if this fails - if we're running low on memory due
2150 * to dirty buffers, we need to flush them out as quickly as possible.
2152 * NOTE: There are quite a number of ways that threads of control can
2153 * obtain a reference to a buffer head within a page. So we must
2154 * lock out all of these paths to cleanly toss the page.
2156 int try_to_free_buffers(struct page * page)
2158 struct buffer_head * tmp, * bh = page->buffers;
2159 int index = BUFSIZE_INDEX(bh->b_size);
2161 spin_lock(&lru_list_lock);
2162 write_lock(&hash_table_lock);
2163 spin_lock(&free_list[index].lock);
2164 tmp = bh;
2165 do {
2166 struct buffer_head *p = tmp;
2168 tmp = tmp->b_this_page;
2169 if (buffer_busy(p))
2170 goto busy_buffer_page;
2171 } while (tmp != bh);
2173 spin_lock(&unused_list_lock);
2174 tmp = bh;
2175 do {
2176 struct buffer_head * p = tmp;
2177 tmp = tmp->b_this_page;
2179 /* The buffer can be either on the regular
2180 * queues or on the free list..
2181 */
2182 if (p->b_dev != B_FREE)
2183 __remove_from_queues(p);
2184 else
2185 __remove_from_free_list(p, index);
2186 __put_unused_buffer_head(p);
2187 } while (tmp != bh);
2188 spin_unlock(&unused_list_lock);
2190 /* Wake up anyone waiting for buffer heads */
2191 wake_up(&buffer_wait);
2193 /* And free the page */
2194 page->buffers = NULL;
2195 __free_page(page);
2196 spin_unlock(&free_list[index].lock);
2197 write_unlock(&hash_table_lock);
2198 spin_unlock(&lru_list_lock);
2199 return 1;
2201 busy_buffer_page:
2202 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2203 spin_unlock(&free_list[index].lock);
2204 write_unlock(&hash_table_lock);
2205 spin_unlock(&lru_list_lock);
2206 sync_page_buffers(bh);
2207 return 0;
2208 }
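/*
 * Illustrative sketch (not part of buffer.c): the lock nesting that
 * try_to_free_buffers() relies on above.  The lock names are the ones
 * declared earlier in this file; the real work is elided.
 */
static void example_toss_page_locking(int index)
{
	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	spin_lock(&free_list[index].lock);

	/* ... check buffer_busy() on every bh of the page and unlink them;
	 * unused_list_lock nests innermost, around __put_unused_buffer_head() ... */

	spin_unlock(&free_list[index].lock);
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
}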
2210 /* ================== Debugging =================== */
2212 void show_buffers(void)
2213 {
2214 #ifdef CONFIG_SMP
2215 struct buffer_head * bh;
2216 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2217 int protected = 0;
2218 int nlist;
2219 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2220 #endif
2222 printk("Buffer memory: %6dkB\n",
2223 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2225 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2226 if (!spin_trylock(&lru_list_lock))
2227 return;
2228 for(nlist = 0; nlist < NR_LIST; nlist++) {
2229 found = locked = dirty = used = lastused = protected = 0;
2230 bh = lru_list[nlist];
2231 if(!bh) continue;
2233 do {
2234 found++;
2235 if (buffer_locked(bh))
2236 locked++;
2237 if (buffer_protected(bh))
2238 protected++;
2239 if (buffer_dirty(bh))
2240 dirty++;
2241 if (atomic_read(&bh->b_count))
2242 used++, lastused = found;
2243 bh = bh->b_next_free;
2244 } while (bh != lru_list[nlist]);
2245 {
2246 int tmp = nr_buffers_type[nlist];
2247 if (found != tmp)
2248 printk("%9s: BUG -> found %d, reported %d\n",
2249 buf_types[nlist], found, tmp);
2250 }
2251 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2252 "%d locked, %d protected, %d dirty\n",
2253 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2254 used, lastused, locked, protected, dirty);
2255 }
2256 spin_unlock(&lru_list_lock);
2257 #endif
2258 }
2260 /* ===================== Init ======================= */
2262 /*
2263 * allocate the hash table and init the free list
2264 * Use gfp() for the hash table to decrease TLB misses, use
2265 * SLAB cache for buffer heads.
2266 */
2267 void __init buffer_init(unsigned long mempages)
2268 {
2269 int order, i;
2270 unsigned int nr_hash;
2272 /* The buffer cache hash table is less important these days,
2273 * trim it a bit.
2274 */
2275 mempages >>= 14;
2277 mempages *= sizeof(struct buffer_head *);
2279 for (order = 0; (1 << order) < mempages; order++)
2280 ;
2282 /* try to allocate something until we get it or we're asking
2283 for something that is really too small */
2285 do {
2286 unsigned long tmp;
2288 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2289 bh_hash_mask = (nr_hash - 1);
2291 tmp = nr_hash;
2292 bh_hash_shift = 0;
2293 while((tmp >>= 1UL) != 0UL)
2294 bh_hash_shift++;
2296 hash_table = (struct buffer_head **)
2297 __get_free_pages(GFP_ATOMIC, order);
2298 } while (hash_table == NULL && --order > 0);
2299 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2300 nr_hash, order, (PAGE_SIZE << order));
2302 if (!hash_table)
2303 panic("Failed to allocate buffer hash table\n");
2305 /* Setup hash chains. */
2306 for(i = 0; i < nr_hash; i++)
2307 hash_table[i] = NULL;
2309 /* Setup free lists. */
2310 for(i = 0; i < NR_SIZES; i++) {
2311 free_list[i].list = NULL;
2312 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2313 }
2315 /* Setup lru lists. */
2316 for(i = 0; i < NR_LIST; i++)
2317 lru_list[i] = NULL;
2319 bh_cachep = kmem_cache_create("buffer_head",
2320 sizeof(struct buffer_head),
2321 0,
2322 SLAB_HWCACHE_ALIGN, NULL, NULL);
2323 if(!bh_cachep)
2324 panic("Cannot create buffer head SLAB cache\n");
2325 }
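/*
 * Illustrative sketch (not part of buffer.c): the hash sizing arithmetic
 * buffer_init() performs above, worked through for a hypothetical machine
 * with 128MB of RAM, 4K pages and 4-byte pointers.  The numbers in the
 * comments are assumptions for this example only.
 */
static unsigned int example_hash_entries(void)
{
	unsigned long mempages = 32768;			/* 128MB / 4K pages  */
	int order;

	mempages >>= 14;				/* -> 2              */
	mempages *= sizeof(struct buffer_head *);	/* -> 8 on 32bit     */
	for (order = 0; (1 << order) < mempages; order++)
		;					/* -> order 3 (32K)  */

	/* 32K of pointers -> 8192 hash chains, so bh_hash_mask ends up
	 * as 8191 and bh_hash_shift as 13. */
	return (PAGE_SIZE << order) / sizeof(struct buffer_head *);
}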
2328 /* ====================== bdflush support =================== */
2330 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2331 * response to dirty buffers. Once this process is activated, we write back
2332 * a limited number of buffers to the disks and then go back to sleep again.
2333 */
2334 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2335 struct task_struct *bdflush_tsk = 0;
2337 void wakeup_bdflush(int block)
2338 {
2339 DECLARE_WAITQUEUE(wait, current);
2341 if (current == bdflush_tsk)
2342 return;
2344 if (!block) {
2345 wake_up_process(bdflush_tsk);
2346 return;
2347 }
2349 /* kflushd can wake us up before we have a chance to
2350 go to sleep, so we must be careful in handling
2351 this wakeup event from kflushd to avoid deadlocking on SMP
2352 (we are not holding any locks anymore in these two paths). */
2353 __set_current_state(TASK_UNINTERRUPTIBLE);
2354 add_wait_queue(&bdflush_done, &wait);
2356 wake_up_process(bdflush_tsk);
2357 schedule();
2359 remove_wait_queue(&bdflush_done, &wait);
2360 __set_current_state(TASK_RUNNING);
2361 }
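/*
 * Illustrative sketch (not part of buffer.c): the two halves of the wakeup
 * handshake used by wakeup_bdflush() above and bdflush() below, written out
 * generically.  "worker" and "done" stand for bdflush_tsk and bdflush_done;
 * both helpers are hypothetical.
 */
static void example_kick_and_wait(struct task_struct *worker,
				  wait_queue_head_t *done)
{
	DECLARE_WAITQUEUE(wait, current);

	/* Queue ourselves *before* poking the worker, so a wake_up(done)
	 * issued immediately by the worker cannot get lost. */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(done, &wait);
	wake_up_process(worker);
	schedule();
	remove_wait_queue(done, &wait);
	__set_current_state(TASK_RUNNING);
}

static void example_notify_and_sleep(wait_queue_head_t *done)
{
	/* Go TASK_INTERRUPTIBLE *before* notifying the waiters: if one of
	 * them wakes us between the wake_up() and the schedule(), the
	 * schedule() returns immediately instead of sleeping forever. */
	__set_current_state(TASK_INTERRUPTIBLE);
	wake_up(done);
	schedule();
	__set_current_state(TASK_RUNNING);
}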
2363 /* This is the _only_ function that deals with flushing async writes
2364 to disk.
2365 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2366 as all dirty buffers live _only_ in the DIRTY lru list.
2367 As we never browse the LOCKED and CLEAN lru lists, they are in fact
2368 completely useless. */
2369 static int flush_dirty_buffers(int check_flushtime)
2370 {
2371 struct buffer_head * bh, *next;
2372 int flushed = 0, i;
2374 restart:
2375 spin_lock(&lru_list_lock);
2376 bh = lru_list[BUF_DIRTY];
2377 if (!bh)
2378 goto out_unlock;
2379 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2380 next = bh->b_next_free;
2382 if (!buffer_dirty(bh)) {
2383 __refile_buffer(bh);
2384 continue;
2385 }
2386 if (buffer_locked(bh))
2387 continue;
2389 if (check_flushtime) {
2390 /* The dirty lru list is chronologically ordered, so
2391 if the current bh has not yet timed out,
2392 all the following bhs
2393 will be too young as well. */
2394 if (time_before(jiffies, bh->b_flushtime))
2395 goto out_unlock;
2396 } else {
2397 if (++flushed > bdf_prm.b_un.ndirty)
2398 goto out_unlock;
2399 }
2401 /* OK, now we are committed to write it out. */
2402 atomic_inc(&bh->b_count);
2403 spin_unlock(&lru_list_lock);
2404 ll_rw_block(WRITE, 1, &bh);
2405 atomic_dec(&bh->b_count);
2407 if (current->need_resched)
2408 schedule();
2409 goto restart;
2410 }
2411 out_unlock:
2412 spin_unlock(&lru_list_lock);
2414 return flushed;
2415 }
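/*
 * Illustrative sketch (not part of buffer.c): the "pin, drop the lock, do
 * the blocking work" step flush_dirty_buffers() performs above, so that
 * ll_rw_block() never sleeps while lru_list_lock is held.  example_write_one()
 * is hypothetical and expects to be entered with lru_list_lock held.
 */
static void example_write_one(struct buffer_head *bh)
{
	atomic_inc(&bh->b_count);	/* pin: bh cannot go away under us   */
	spin_unlock(&lru_list_lock);	/* never sleep with the lock held    */
	ll_rw_block(WRITE, 1, &bh);	/* may block waiting for a request   */
	atomic_dec(&bh->b_count);
	/* The lock was dropped, so the caller restarts its scan rather than
	 * trusting any list pointers captured before the write. */
}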
2417 /*
2418 * Here we attempt to write back old buffers. We also try to flush inodes
2419 * and supers as well, since this function is essentially "update", and
2420 * otherwise there would be no way of ensuring that these quantities ever
2421 * get written back. Ideally, we would have a timestamp on the inodes
2422 * and superblocks so that we could write back only the old ones as well.
2423 */
2425 static int sync_old_buffers(void)
2426 {
2427 lock_kernel();
2428 sync_supers(0);
2429 sync_inodes(0);
2430 unlock_kernel();
2432 flush_dirty_buffers(1);
2433 /* must really sync all the active I/O request to disk here */
2434 run_task_queue(&tq_disk);
2435 return 0;
2436 }
2438 int block_sync_page(struct page *page)
2439 {
2440 run_task_queue(&tq_disk);
2441 return 0;
2442 }
2444 /* This is the interface to bdflush. As we get more sophisticated, we can
2445 * pass tuning parameters to this "process", to adjust how it behaves.
2446 * We would want to verify each parameter, however, to make sure that it
2447 * is reasonable. */
2449 asmlinkage long sys_bdflush(int func, long data)
2450 {
2451 if (!capable(CAP_SYS_ADMIN))
2452 return -EPERM;
2454 if (func == 1) {
2455 /* do_exit directly and let kupdate do its work alone. */
2456 do_exit(0);
2457 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2458 a syscall that doesn't care about the current mm context. */
2459 int error;
2460 struct mm_struct *user_mm;
2462 /*
2463 * bdflush will spend all of its time in kernel-space,
2464 * without touching user-space, so we can switch it into
2465 * 'lazy TLB mode' to reduce the cost of context-switches
2466 * to and from bdflush.
2467 */
2468 user_mm = start_lazy_tlb();
2469 error = sync_old_buffers();
2470 end_lazy_tlb(user_mm);
2471 return error;
2472 #endif
2473 }
2475 /* Basically func 2 means read param 1, func 3 means write param 1, etc */
2476 if (func >= 2) {
2477 int i = (func-2) >> 1;
2478 if (i >= 0 && i < N_PARAM) {
2479 if ((func & 1) == 0)
2480 return put_user(bdf_prm.data[i], (int*)data);
2482 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2483 bdf_prm.data[i] = data;
2484 return 0;
2485 }
2486 }
2487 return -EINVAL;
2488 }
2490 /* Func 0 used to launch the actual bdflush and then never
2491 * return (unless explicitly killed). We return zero here to
2492 * remain semi-compatible with present update(8) programs.
2493 */
2494 return 0;
2495 }
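/*
 * Illustrative sketch (userspace, not part of this file): driving the
 * interface above through syscall(2).  Reads use even func numbers and
 * writes odd ones, so func 2 reads the first parameter and func 3 writes
 * it.  SYS_bdflush is assumed to be defined for the target architecture,
 * and the caller needs CAP_SYS_ADMIN.
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *
 *	int nfract;
 *	syscall(SYS_bdflush, 2, (long) &nfract);   // read param 1
 *	syscall(SYS_bdflush, 3, (long) nfract);    // write param 1 back
 */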
2497 /*
2498 * This is the actual bdflush daemon itself. It used to be started from
2499 * the syscall above, but now we launch it ourselves internally with
2500 * kernel_thread(...) directly after the first thread in init/main.c
2501 */
2502 int bdflush(void * unused)
2503 {
2504 struct task_struct *tsk = current;
2505 int flushed;
2506 /*
2507 * We have a bare-bones task_struct, and really should fill
2508 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2509 * display semi-sane things. Not really crucial though...
2510 */
2512 tsk->session = 1;
2513 tsk->pgrp = 1;
2514 strcpy(tsk->comm, "kflushd");
2515 bdflush_tsk = tsk;
2517 /* avoid getting signals */
2518 spin_lock_irq(&tsk->sigmask_lock);
2519 flush_signals(tsk);
2520 sigfillset(&tsk->blocked);
2521 recalc_sigpending(tsk);
2522 spin_unlock_irq(&tsk->sigmask_lock);
2524 for (;;) {
2525 CHECK_EMERGENCY_SYNC
2527 flushed = flush_dirty_buffers(0);
2529 /* If wakeup_bdflush() wakes us up
2530 after our bdflush_done wakeup, then
2531 we must make sure not to sleep
2532 in schedule_timeout(), otherwise
2533 wakeup_bdflush() may wait for our
2534 bdflush_done wakeup that would never arrive
2535 (as we would be sleeping) and so it would
2536 deadlock on SMP. */
2537 __set_current_state(TASK_INTERRUPTIBLE);
2538 wake_up(&bdflush_done);
2539 /*
2540 * If there are still a lot of dirty buffers around,
2541 * skip the sleep and flush some more. Otherwise, we
2542 * go to sleep waiting for a wakeup.
2543 */
2544 if (!flushed || balance_dirty_state(NODEV) < 0)
2545 schedule();
2546 /* Remember to mark us as running otherwise
2547 the next schedule will block. */
2548 __set_current_state(TASK_RUNNING);
2549 }
2550 }
2552 /*
2553 * This is the kernel update daemon. It used to live in userspace
2554 * but since it needs to run safely we don't want it killable by mistake.
2555 * You don't need to change your userspace configuration since
2556 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2557 */
2558 int kupdate(void * unused)
2559 {
2560 struct task_struct * tsk = current;
2561 int interval;
2563 tsk->session = 1;
2564 tsk->pgrp = 1;
2565 strcpy(tsk->comm, "kupdate");
2567 /* SIGSTOP and SIGCONT will stop and wake up kupdate */
2568 spin_lock_irq(&tsk->sigmask_lock);
2569 sigfillset(&tsk->blocked);
2570 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2571 recalc_sigpending(tsk);
2572 spin_unlock_irq(&tsk->sigmask_lock);
2574 for (;;) {
2575 /* update interval */
2576 interval = bdf_prm.b_un.interval;
2577 if (interval) {
2578 tsk->state = TASK_INTERRUPTIBLE;
2579 schedule_timeout(interval);
2580 } else {
2581 stop_kupdate:
2582 tsk->state = TASK_STOPPED;
2583 schedule(); /* wait for SIGCONT */
2584 }
2585 /* check for sigstop */
2586 if (signal_pending(tsk)) {
2587 int stopped = 0;
2588 spin_lock_irq(&tsk->sigmask_lock);
2589 if (sigismember(&tsk->signal, SIGSTOP)) {
2590 sigdelset(&tsk->signal, SIGSTOP);
2591 stopped = 1;
2592 }
2593 recalc_sigpending(tsk);
2594 spin_unlock_irq(&tsk->sigmask_lock);
2595 if (stopped)
2596 goto stop_kupdate;
2597 }
2598 #ifdef DEBUG
2599 printk("kupdate() activated...\n");
2600 #endif
2601 sync_old_buffers();
2602 }
2603 }
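/*
 * Illustrative sketch (userspace, not part of this file): pausing and
 * resuming the periodic sync performed by kupdate() above, relying on the
 * SIGSTOP/SIGCONT handling it sets up.  "pid" is assumed to be kupdate's
 * pid, e.g. taken from ps.
 *
 *	#include <signal.h>
 *
 *	kill(pid, SIGSTOP);	// kupdate parks itself in TASK_STOPPED
 *	kill(pid, SIGCONT);	// the periodic sync_old_buffers() resumes
 */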
2605 static int __init bdflush_init(void)
2606 {
2607 kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2608 kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2609 return 0;
2610 }
2612 module_init(bdflush_init)