1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
53 #define NR_SIZES 7
54 static char buffersize_index[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
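/*
 * buffersize_index[] maps a block size (expressed in 512-byte units) onto
 * the index of its size class, so for the seven supported sizes:
 * BUFSIZE_INDEX(512) == 0, BUFSIZE_INDEX(1024) == 1, BUFSIZE_INDEX(4096) == 3,
 * ..., BUFSIZE_INDEX(32768) == 6.  Unsupported sizes hit a -1 entry.
 */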
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
72 * Hash table gook..
74 static unsigned int bh_hash_mask;
75 static unsigned int bh_hash_shift;
76 static struct buffer_head **hash_table;
77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
79 static struct buffer_head *lru_list[NR_LIST];
80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
81 static int nr_buffers_type[NR_LIST];
82 static unsigned long size_buffers_type[NR_LIST];
84 static struct buffer_head * unused_list;
85 static int nr_unused_buffer_heads;
86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
89 struct bh_free_head {
90 struct buffer_head *list;
91 spinlock_t lock;
93 static struct bh_free_head free_list[NR_SIZES];
95 kmem_cache_t *bh_cachep;
97 static int grow_buffers(int size);
98 static void __refile_buffer(struct buffer_head *);
100 /* This is used by some architectures to estimate available memory. */
101 atomic_t buffermem_pages = ATOMIC_INIT(0);
103 /* Here is the parameter block for the bdflush process. If you add or
104 * remove any of the parameters, make sure to update kernel/sysctl.c.
107 #define N_PARAM 9
109 /* The dummy values in this structure are left in there for compatibility
110 * with old programs that play with the /proc entries.
112 union bdflush_param {
113 struct {
114 int nfract; /* Percentage of buffer cache dirty to
115 activate bdflush */
116 int ndirty; /* Maximum number of dirty blocks to write out per
117 wake-cycle */
118 int nrefill; /* Number of clean buffers to try to obtain
119 each time we call refill */
120 int nref_dirt; /* Dirty buffer threshold for activating bdflush
121 when trying to refill buffers. */
122 int interval; /* jiffies delay between kupdate flushes */
123 int age_buffer; /* Time for normal buffer to age before we flush it */
124 int age_super; /* Time for superblock to age before we flush it */
125 int dummy2; /* unused */
126 int dummy3; /* unused */
127 } b_un;
128 unsigned int data[N_PARAM];
129 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
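/*
 * With the initialiser above the defaults are: nfract = 40 (%), ndirty = 500,
 * nrefill = 64, nref_dirt = 256, interval = 5*HZ, age_buffer = 30*HZ,
 * age_super = 5*HZ, plus the two unused dummy slots (1884, 2) kept only for
 * /proc compatibility.
 */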
131 /* These are the min and max parameter values that we will allow to be assigned */
132 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
133 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
136 * Rewrote the wait-routines to use the "new" wait-queue functionality,
137 * and got rid of the cli-sti pairs. The wait-queue routines still
138 * need cli-sti, but now it's just a couple of 386 instructions or so.
140 * Note that the real wait_on_buffer() is an inline function that checks
141 * if 'b_wait' is set before calling this, so that the queues aren't set
142 * up unnecessarily.
144 void __wait_on_buffer(struct buffer_head * bh)
146 struct task_struct *tsk = current;
147 DECLARE_WAITQUEUE(wait, tsk);
149 atomic_inc(&bh->b_count);
150 add_wait_queue(&bh->b_wait, &wait);
151 do {
152 run_task_queue(&tq_disk);
153 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
154 if (!buffer_locked(bh))
155 break;
156 schedule();
157 } while (buffer_locked(bh));
158 tsk->state = TASK_RUNNING;
159 remove_wait_queue(&bh->b_wait, &wait);
160 atomic_dec(&bh->b_count);
163 /* Call sync_buffers with wait!=0 to ensure that the call does not
164 * return until all buffer writes have completed. Sync() may return
165 * before the writes have finished; fsync() may not.
168 /* Godamity-damn. Some buffers (bitmaps for filesystems)
169 * spontaneously dirty themselves without ever brelse being called.
170 * We will ultimately want to put these in a separate list, but for
171 * now we search all of the lists for dirty buffers.
173 static int sync_buffers(kdev_t dev, int wait)
175 int i, retry, pass = 0, err = 0;
176 struct buffer_head * bh, *next;
178 /* One pass for no-wait, three for wait:
179 * 0) write out all dirty, unlocked buffers;
180 * 1) write out all dirty buffers, waiting if locked;
181 * 2) wait for completion by waiting for all buffers to unlock.
183 do {
184 retry = 0;
186 /* We search all lists as a failsafe mechanism, not because we expect
187 * there to be dirty buffers on any of the other lists.
189 repeat:
190 spin_lock(&lru_list_lock);
191 bh = lru_list[BUF_DIRTY];
192 if (!bh)
193 goto repeat2;
195 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
196 next = bh->b_next_free;
198 if (!lru_list[BUF_DIRTY])
199 break;
200 if (dev && bh->b_dev != dev)
201 continue;
202 if (buffer_locked(bh)) {
203 /* Buffer is locked; skip it unless wait is
204 * requested AND pass > 0.
206 if (!wait || !pass) {
207 retry = 1;
208 continue;
210 atomic_inc(&bh->b_count);
211 spin_unlock(&lru_list_lock);
212 wait_on_buffer (bh);
213 atomic_dec(&bh->b_count);
214 goto repeat;
217 /* If an unlocked buffer is not uptodate, there has
218 * been an IO error. Skip it.
220 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
221 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
222 err = -EIO;
223 continue;
226 /* Don't write clean buffers. Don't write ANY buffers
227 * on the third pass.
229 if (!buffer_dirty(bh) || pass >= 2)
230 continue;
232 atomic_inc(&bh->b_count);
233 spin_unlock(&lru_list_lock);
234 ll_rw_block(WRITE, 1, &bh);
235 atomic_dec(&bh->b_count);
236 retry = 1;
237 goto repeat;
240 repeat2:
241 bh = lru_list[BUF_LOCKED];
242 if (!bh) {
243 spin_unlock(&lru_list_lock);
244 break;
246 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
247 next = bh->b_next_free;
249 if (!lru_list[BUF_LOCKED])
250 break;
251 if (dev && bh->b_dev != dev)
252 continue;
253 if (buffer_locked(bh)) {
254 /* Buffer is locked; skip it unless wait is
255 * requested AND pass > 0.
257 if (!wait || !pass) {
258 retry = 1;
259 continue;
261 atomic_inc(&bh->b_count);
262 spin_unlock(&lru_list_lock);
263 wait_on_buffer (bh);
264 spin_lock(&lru_list_lock);
265 atomic_dec(&bh->b_count);
266 goto repeat2;
269 spin_unlock(&lru_list_lock);
271 /* If we are waiting for the sync to succeed, and if any dirty
272 * blocks were written, then repeat; on the second pass, only
273 * wait for buffers being written (do not pass to write any
274 * more buffers on the second pass).
276 } while (wait && retry && ++pass<=2);
277 return err;
280 void sync_dev(kdev_t dev)
282 sync_supers(dev);
283 sync_inodes(dev);
284 DQUOT_SYNC(dev);
285 /* sync all the dirty buffers out to disk only _after_ all the
286 high level layers have finished generating dirty buffer data
287 (or we'll return with some buffer still dirty on the blockdevice
288 so breaking the semantics of this call) */
289 sync_buffers(dev, 0);
291 * FIXME(eric) we need to sync the physical devices here.
292 * This is because some (scsi) controllers have huge amounts of
293 * cache onboard (hundreds of Mb), and we need to instruct
294 * them to commit all of the dirty memory to disk, and we should
295 * not return until this has happened.
297 * This would need to get implemented by going through the assorted
298 * layers so that each block major number can be synced, and this
299 * would call down into the upper and mid-layer scsi.
303 int fsync_dev(kdev_t dev)
305 sync_buffers(dev, 0);
307 lock_kernel();
308 sync_supers(dev);
309 sync_inodes(dev);
310 DQUOT_SYNC(dev);
311 unlock_kernel();
313 return sync_buffers(dev, 1);
316 asmlinkage long sys_sync(void)
318 fsync_dev(0);
319 return 0;
323 * filp may be NULL if called via the msync of a vma.
326 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
328 struct inode * inode = dentry->d_inode;
329 struct super_block * sb;
330 kdev_t dev;
331 int ret;
333 lock_kernel();
334 /* sync the inode to buffers */
335 write_inode_now(inode, 0);
337 /* sync the superblock to buffers */
338 sb = inode->i_sb;
339 wait_on_super(sb);
340 if (sb->s_op && sb->s_op->write_super)
341 sb->s_op->write_super(sb);
343 /* .. finally sync the buffers to disk */
344 dev = inode->i_dev;
345 ret = sync_buffers(dev, 1);
346 unlock_kernel();
347 return ret;
350 asmlinkage long sys_fsync(unsigned int fd)
352 struct file * file;
353 struct dentry * dentry;
354 struct inode * inode;
355 int err;
357 err = -EBADF;
358 file = fget(fd);
359 if (!file)
360 goto out;
362 dentry = file->f_dentry;
363 inode = dentry->d_inode;
365 err = -EINVAL;
366 if (!file->f_op || !file->f_op->fsync)
367 goto out_putf;
369 /* We need to protect against concurrent writers.. */
370 down(&inode->i_sem);
371 err = file->f_op->fsync(file, dentry, 0);
372 up(&inode->i_sem);
374 out_putf:
375 fput(file);
376 out:
377 return err;
380 asmlinkage long sys_fdatasync(unsigned int fd)
382 struct file * file;
383 struct dentry * dentry;
384 struct inode * inode;
385 int err;
387 err = -EBADF;
388 file = fget(fd);
389 if (!file)
390 goto out;
392 dentry = file->f_dentry;
393 inode = dentry->d_inode;
395 err = -EINVAL;
396 if (!file->f_op || !file->f_op->fsync)
397 goto out_putf;
399 down(&inode->i_sem);
400 err = file->f_op->fsync(file, dentry, 1);
401 up(&inode->i_sem);
403 out_putf:
404 fput(file);
405 out:
406 return err;
409 /* After several hours of tedious analysis, the following hash
410 * function won. Do not mess with it... -DaveM
412 #define _hashfn(dev,block) \
413 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
414 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
415 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
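/*
 * Buffers are looked up by (dev, block): _hashfn() folds both values under
 * bh_hash_mask and hash() yields the matching chain head in hash_table[].
 * Chains are singly linked through b_next with a back pointer in b_pprev,
 * protected by hash_table_lock (read side for lookups, write side for
 * link/unlink).
 */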
417 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
419 if ((bh->b_next = *head) != NULL)
420 bh->b_next->b_pprev = &bh->b_next;
421 *head = bh;
422 bh->b_pprev = head;
425 static __inline__ void __hash_unlink(struct buffer_head *bh)
427 if (bh->b_pprev) {
428 if (bh->b_next)
429 bh->b_next->b_pprev = bh->b_pprev;
430 *(bh->b_pprev) = bh->b_next;
431 bh->b_pprev = NULL;
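/*
 * The lru_list[] and free_list[] rings below are circular, doubly linked
 * lists threaded through b_next_free/b_prev_free.  lru_list_lock protects
 * the LRU rings and their counters, while each free_list[isize] has its own
 * spinlock, following the lock ordering documented above.
 */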
435 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
437 struct buffer_head **bhp = &lru_list[blist];
439 if(!*bhp) {
440 *bhp = bh;
441 bh->b_prev_free = bh;
443 bh->b_next_free = *bhp;
444 bh->b_prev_free = (*bhp)->b_prev_free;
445 (*bhp)->b_prev_free->b_next_free = bh;
446 (*bhp)->b_prev_free = bh;
447 nr_buffers_type[blist]++;
448 size_buffers_type[blist] += bh->b_size;
451 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
453 if (bh->b_prev_free || bh->b_next_free) {
454 bh->b_prev_free->b_next_free = bh->b_next_free;
455 bh->b_next_free->b_prev_free = bh->b_prev_free;
456 if (lru_list[blist] == bh)
457 lru_list[blist] = bh->b_next_free;
458 if (lru_list[blist] == bh)
459 lru_list[blist] = NULL;
460 bh->b_next_free = bh->b_prev_free = NULL;
461 nr_buffers_type[blist]--;
462 size_buffers_type[blist] -= bh->b_size;
466 static void __remove_from_free_list(struct buffer_head * bh, int index)
468 if(bh->b_next_free == bh)
469 free_list[index].list = NULL;
470 else {
471 bh->b_prev_free->b_next_free = bh->b_next_free;
472 bh->b_next_free->b_prev_free = bh->b_prev_free;
473 if (free_list[index].list == bh)
474 free_list[index].list = bh->b_next_free;
476 bh->b_next_free = bh->b_prev_free = NULL;
479 /* must be called with both the hash_table_lock and the lru_list_lock
480 held */
481 static void __remove_from_queues(struct buffer_head *bh)
483 __hash_unlink(bh);
484 __remove_from_lru_list(bh, bh->b_list);
487 static void insert_into_queues(struct buffer_head *bh)
489 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
491 spin_lock(&lru_list_lock);
492 write_lock(&hash_table_lock);
493 __hash_link(bh, head);
494 __insert_into_lru_list(bh, bh->b_list);
495 write_unlock(&hash_table_lock);
496 spin_unlock(&lru_list_lock);
499 /* This function must only run if there are no other
500 * references _anywhere_ to this buffer head.
502 static void put_last_free(struct buffer_head * bh)
504 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
505 struct buffer_head **bhp = &head->list;
507 bh->b_state = 0;
509 spin_lock(&head->lock);
510 bh->b_dev = B_FREE;
511 if(!*bhp) {
512 *bhp = bh;
513 bh->b_prev_free = bh;
515 bh->b_next_free = *bhp;
516 bh->b_prev_free = (*bhp)->b_prev_free;
517 (*bhp)->b_prev_free->b_next_free = bh;
518 (*bhp)->b_prev_free = bh;
519 spin_unlock(&head->lock);
523 * Why like this, I hear you say... The reason is race-conditions.
524 * As we don't lock buffers (unless we are reading them, that is),
525 * something might happen to them while we sleep (i.e. a read error
526 * will force them bad). This shouldn't really happen currently, but
527 * the code is ready.
529 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
531 struct buffer_head **head = &hash(dev, block);
532 struct buffer_head *bh;
534 read_lock(&hash_table_lock);
535 for(bh = *head; bh; bh = bh->b_next)
536 if (bh->b_blocknr == block &&
537 bh->b_size == size &&
538 bh->b_dev == dev)
539 break;
540 if (bh)
541 atomic_inc(&bh->b_count);
542 read_unlock(&hash_table_lock);
544 return bh;
547 unsigned int get_hardblocksize(kdev_t dev)
550 * Get the hard sector size for the given device. If we don't know
551 * what it is, return 0.
553 if (hardsect_size[MAJOR(dev)] != NULL) {
554 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
555 if (blksize != 0)
556 return blksize;
560 * We don't know what the hardware sector size for this device is.
561 * Return 0 indicating that we don't know.
563 return 0;
566 /* If invalidate_buffers() will trash dirty buffers, it means some kind
567 of fs corruption is going on. Trashing dirty data always implies losing
568 information that was supposed to be just stored on the physical layer
569 by the user.
571 Thus invalidate_buffers in general usage is not allowed to trash dirty
572 buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
574 NOTE: in the case where the user removed a removable-media disk, even if
575 there's still dirty data not synced on disk (due to a bug in the device driver
576 or to an error of the user), by not destroying the dirty buffers we could
577 generate corruption also on the next media inserted; thus a parameter is
578 necessary to handle this case in the safest way possible (trying
579 not to corrupt the newly inserted disk with the data belonging to
580 the old, now corrupted, disk). Also for the ramdisk the natural thing
581 to do in order to release the ramdisk memory is to destroy dirty buffers.
583 These are two special cases. Normal usage implies that the device driver
584 issues a sync on the device (without waiting for I/O completion) and
585 then calls invalidate_buffers() in a way that doesn't trash dirty buffers. */
586 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
588 int i, nlist, slept;
589 struct buffer_head * bh, * bh_next;
591 retry:
592 slept = 0;
593 spin_lock(&lru_list_lock);
594 for(nlist = 0; nlist < NR_LIST; nlist++) {
595 bh = lru_list[nlist];
596 if (!bh)
597 continue;
598 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
599 bh_next = bh->b_next_free;
600 if (bh->b_dev != dev)
601 continue;
602 if (buffer_locked(bh)) {
603 atomic_inc(&bh->b_count);
604 spin_unlock(&lru_list_lock);
605 wait_on_buffer(bh);
606 slept = 1;
607 spin_lock(&lru_list_lock);
608 atomic_dec(&bh->b_count);
611 write_lock(&hash_table_lock);
612 if (!atomic_read(&bh->b_count) &&
613 (destroy_dirty_buffers || !buffer_dirty(bh))) {
614 __remove_from_queues(bh);
615 put_last_free(bh);
617 write_unlock(&hash_table_lock);
618 if (slept)
619 goto out;
622 out:
623 spin_unlock(&lru_list_lock);
624 if (slept)
625 goto retry;
628 void set_blocksize(kdev_t dev, int size)
630 extern int *blksize_size[];
631 int i, nlist, slept;
632 struct buffer_head * bh, * bh_next;
634 if (!blksize_size[MAJOR(dev)])
635 return;
637 /* Size must be a power of two, and between 512 and PAGE_SIZE */
638 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
639 panic("Invalid blocksize passed to set_blocksize");
641 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
642 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
643 return;
645 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
646 return;
647 sync_buffers(dev, 2);
648 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
650 retry:
651 slept = 0;
652 spin_lock(&lru_list_lock);
653 for(nlist = 0; nlist < NR_LIST; nlist++) {
654 bh = lru_list[nlist];
655 if (!bh)
656 continue;
657 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
658 bh_next = bh->b_next_free;
659 if (bh->b_dev != dev || bh->b_size == size)
660 continue;
661 if (buffer_locked(bh)) {
662 atomic_inc(&bh->b_count);
663 spin_unlock(&lru_list_lock);
664 wait_on_buffer(bh);
665 slept = 1;
666 spin_lock(&lru_list_lock);
667 atomic_dec(&bh->b_count);
670 write_lock(&hash_table_lock);
671 if (!atomic_read(&bh->b_count)) {
672 if (buffer_dirty(bh))
673 printk(KERN_WARNING
674 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
675 kdevname(dev), bh->b_blocknr, bh->b_size);
676 __remove_from_queues(bh);
677 put_last_free(bh);
678 } else {
679 if (atomic_set_buffer_clean(bh))
680 __refile_buffer(bh);
681 clear_bit(BH_Uptodate, &bh->b_state);
682 printk(KERN_WARNING
683 "set_blocksize: "
684 "b_count %d, dev %s, block %lu, from %p\n",
685 atomic_read(&bh->b_count), bdevname(bh->b_dev),
686 bh->b_blocknr, __builtin_return_address(0));
688 write_unlock(&hash_table_lock);
689 if (slept)
690 goto out;
693 out:
694 spin_unlock(&lru_list_lock);
695 if (slept)
696 goto retry;
700 * We used to try various strange things. Let's not.
702 static void refill_freelist(int size)
704 if (!grow_buffers(size)) {
705 wakeup_bdflush(1);
706 current->policy |= SCHED_YIELD;
707 schedule();
711 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
713 bh->b_list = BUF_CLEAN;
714 bh->b_end_io = handler;
715 bh->b_dev_id = dev_id;
718 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
720 mark_buffer_uptodate(bh, uptodate);
721 unlock_buffer(bh);
724 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
726 mark_buffer_uptodate(bh, uptodate);
727 unlock_buffer(bh);
728 BUG();
731 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
733 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
734 unsigned long flags;
735 struct buffer_head *tmp;
736 struct page *page;
738 mark_buffer_uptodate(bh, uptodate);
740 /* This is a temporary buffer used for page I/O. */
741 page = bh->b_page;
743 if (!uptodate)
744 SetPageError(page);
747 * Be _very_ careful from here on. Bad things can happen if
748 * two buffer heads end IO at almost the same time and both
749 * decide that the page is now completely done.
751 * Async buffer_heads are here only as labels for IO, and get
752 * thrown away once the IO for this page is complete. IO is
753 * deemed complete once all buffers have been visited
754 * (b_count==0) and are now unlocked. We must make sure that
755 * only the _last_ buffer that decrements its count is the one
756 * that unlocks the page.
758 spin_lock_irqsave(&page_uptodate_lock, flags);
759 unlock_buffer(bh);
760 atomic_dec(&bh->b_count);
761 tmp = bh->b_this_page;
762 while (tmp != bh) {
763 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
764 goto still_busy;
765 tmp = tmp->b_this_page;
768 /* OK, the async IO on this page is complete. */
769 spin_unlock_irqrestore(&page_uptodate_lock, flags);
772 * if none of the buffers had errors then we can set the
773 * page uptodate:
775 if (!PageError(page))
776 SetPageUptodate(page);
779 * Run the hooks that have to be done when a page I/O has completed.
781 if (PageTestandClearDecrAfter(page))
782 atomic_dec(&nr_async_pages);
784 UnlockPage(page);
786 return;
788 still_busy:
789 spin_unlock_irqrestore(&page_uptodate_lock, flags);
790 return;
794 * Ok, this is getblk, and it isn't very clear, again to hinder
795 * race-conditions. Most of the code is seldom used, (ie repeating),
796 * so it should be much more efficient than it looks.
798 * The algorithm is changed: hopefully better, and an elusive bug removed.
800 * 14.02.92: changed it to sync dirty buffers a bit: better performance
801 * when the filesystem starts to get full of dirty blocks (I hope).
803 struct buffer_head * getblk(kdev_t dev, int block, int size)
805 struct buffer_head * bh;
806 int isize;
808 repeat:
809 bh = get_hash_table(dev, block, size);
810 if (bh)
811 goto out;
813 isize = BUFSIZE_INDEX(size);
814 spin_lock(&free_list[isize].lock);
815 bh = free_list[isize].list;
816 if (bh) {
817 __remove_from_free_list(bh, isize);
818 atomic_set(&bh->b_count, 1);
820 spin_unlock(&free_list[isize].lock);
823 * OK, FINALLY we know that this buffer is the only one of
824 * its kind, we hold a reference (b_count>0), it is unlocked,
825 * and it is clean.
827 if (bh) {
828 init_buffer(bh, end_buffer_io_sync, NULL);
829 bh->b_dev = dev;
830 bh->b_blocknr = block;
831 bh->b_state = 1 << BH_Mapped;
833 /* Insert the buffer into the regular lists */
834 insert_into_queues(bh);
835 out:
836 touch_buffer(bh);
837 return bh;
841 * If we block while refilling the free list, somebody may
842 * create the buffer first ... search the hashes again.
844 refill_freelist(size);
845 goto repeat;
848 /* -1 -> no need to flush
849 0 -> async flush
850 1 -> sync flush (wait for I/O completion) */
851 static int balance_dirty_state(kdev_t dev)
853 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
855 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
856 tot = nr_free_buffer_pages();
857 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
859 dirty *= 200;
860 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
861 hard_dirty_limit = soft_dirty_limit * 2;
863 if (dirty > soft_dirty_limit) {
864 if (dirty > hard_dirty_limit)
865 return 1;
866 return 0;
868 return -1;
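/*
 * The arithmetic above compares dirty*200 against tot*nfract, i.e. it checks
 * whether dirty/tot exceeds nfract/200.  With the default nfract of 40 that
 * puts the soft limit at 20% of the freeable buffer pages being dirty and the
 * hard limit (twice the soft one) at 40%.
 */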
872 * if a new dirty buffer is created we need to balance bdflush.
874 * in the future we might want to make bdflush aware of different
875 * pressures on different devices - thus the (currently unused)
876 * 'dev' parameter.
878 void balance_dirty(kdev_t dev)
880 int state = balance_dirty_state(dev);
882 if (state < 0)
883 return;
884 wakeup_bdflush(state);
887 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
889 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
890 refile_buffer(bh);
893 /* atomic version, the user must call balance_dirty() by hand
894 as soon as it becomes possible to block */
895 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
897 if (!atomic_set_buffer_dirty(bh))
898 __mark_dirty(bh, flag);
901 void mark_buffer_dirty(struct buffer_head *bh, int flag)
903 __mark_buffer_dirty(bh, flag);
904 balance_dirty(bh->b_dev);
908 * A buffer may need to be moved from one buffer list to another
909 * (e.g. in case it is not shared any more). Handle this.
911 static void __refile_buffer(struct buffer_head *bh)
913 int dispose = BUF_CLEAN;
914 if (buffer_locked(bh))
915 dispose = BUF_LOCKED;
916 if (buffer_dirty(bh))
917 dispose = BUF_DIRTY;
918 if (buffer_protected(bh))
919 dispose = BUF_PROTECTED;
920 if (dispose != bh->b_list) {
921 __remove_from_lru_list(bh, bh->b_list);
922 bh->b_list = dispose;
923 __insert_into_lru_list(bh, dispose);
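/*
 * Note that in __refile_buffer() the later tests overwrite 'dispose', so
 * BUF_PROTECTED takes precedence over BUF_DIRTY, which in turn takes
 * precedence over BUF_LOCKED; only a clean, unlocked, unprotected buffer
 * stays on (or moves to) BUF_CLEAN.
 */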
927 void refile_buffer(struct buffer_head *bh)
929 spin_lock(&lru_list_lock);
930 __refile_buffer(bh);
931 spin_unlock(&lru_list_lock);
935 * Release a buffer head
937 void __brelse(struct buffer_head * buf)
939 if (atomic_read(&buf->b_count)) {
940 atomic_dec(&buf->b_count);
941 return;
943 printk("VFS: brelse: Trying to free free buffer\n");
947 * bforget() is like brelse(), except it puts the buffer on the
948 * free list if it can.. We can NOT free the buffer if:
949 * - there are other users of it
950 * - it is locked and thus can have active IO
952 void __bforget(struct buffer_head * buf)
954 /* grab the lru lock here to block bdflush. */
955 spin_lock(&lru_list_lock);
956 write_lock(&hash_table_lock);
957 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
958 goto in_use;
959 __hash_unlink(buf);
960 write_unlock(&hash_table_lock);
961 __remove_from_lru_list(buf, buf->b_list);
962 spin_unlock(&lru_list_lock);
963 put_last_free(buf);
964 return;
966 in_use:
967 write_unlock(&hash_table_lock);
968 spin_unlock(&lru_list_lock);
972 * bread() reads a specified block and returns the buffer that contains
973 * it. It returns NULL if the block was unreadable.
975 struct buffer_head * bread(kdev_t dev, int block, int size)
977 struct buffer_head * bh;
979 bh = getblk(dev, block, size);
980 if (buffer_uptodate(bh))
981 return bh;
982 ll_rw_block(READ, 1, &bh);
983 wait_on_buffer(bh);
984 if (buffer_uptodate(bh))
985 return bh;
986 brelse(bh);
987 return NULL;
991 * Ok, breada can be used as bread, but additionally marks other
992 * blocks for reading as well. End the argument list with a negative
993 * number.
996 #define NBUF 16
998 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
999 unsigned int pos, unsigned int filesize)
1001 struct buffer_head * bhlist[NBUF];
1002 unsigned int blocks;
1003 struct buffer_head * bh;
1004 int index;
1005 int i, j;
1007 if (pos >= filesize)
1008 return NULL;
1010 if (block < 0)
1011 return NULL;
1013 bh = getblk(dev, block, bufsize);
1014 index = BUFSIZE_INDEX(bh->b_size);
1016 if (buffer_uptodate(bh))
1017 return(bh);
1018 else ll_rw_block(READ, 1, &bh);
1020 blocks = (filesize - pos) >> (9+index);
1022 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1023 blocks = read_ahead[MAJOR(dev)] >> index;
1024 if (blocks > NBUF)
1025 blocks = NBUF;
1027 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1029 bhlist[0] = bh;
1030 j = 1;
1031 for(i=1; i<blocks; i++) {
1032 bh = getblk(dev,block+i,bufsize);
1033 if (buffer_uptodate(bh)) {
1034 brelse(bh);
1035 break;
1037 else bhlist[j++] = bh;
1040 /* Request the read for these buffers, and then release them. */
1041 if (j>1)
1042 ll_rw_block(READA, (j-1), bhlist+1);
1043 for(i=1; i<j; i++)
1044 brelse(bhlist[i]);
1046 /* Wait for this buffer, and then continue on. */
1047 bh = bhlist[0];
1048 wait_on_buffer(bh);
1049 if (buffer_uptodate(bh))
1050 return bh;
1051 brelse(bh);
1052 return NULL;
1056 * Note: the caller should wake up the buffer_wait list if needed.
1058 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1060 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1061 kmem_cache_free(bh_cachep, bh);
1062 } else {
1063 bh->b_blocknr = -1;
1064 init_waitqueue_head(&bh->b_wait);
1065 nr_unused_buffer_heads++;
1066 bh->b_next_free = unused_list;
1067 bh->b_this_page = NULL;
1068 unused_list = bh;
1073 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1074 * no-buffer-head deadlock. Return NULL on failure; waiting for
1075 * buffer heads is now handled in create_buffers().
1077 static struct buffer_head * get_unused_buffer_head(int async)
1079 struct buffer_head * bh;
1081 spin_lock(&unused_list_lock);
1082 if (nr_unused_buffer_heads > NR_RESERVED) {
1083 bh = unused_list;
1084 unused_list = bh->b_next_free;
1085 nr_unused_buffer_heads--;
1086 spin_unlock(&unused_list_lock);
1087 return bh;
1089 spin_unlock(&unused_list_lock);
1091 /* This is critical. We can't swap out pages to get
1092 * more buffer heads, because the swap-out may need
1093 * more buffer-heads itself. Thus SLAB_BUFFER.
1095 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1096 memset(bh, 0, sizeof(*bh));
1097 init_waitqueue_head(&bh->b_wait);
1098 return bh;
1102 * If we need an async buffer, use the reserved buffer heads.
1104 if (async) {
1105 spin_lock(&unused_list_lock);
1106 if (unused_list) {
1107 bh = unused_list;
1108 unused_list = bh->b_next_free;
1109 nr_unused_buffer_heads--;
1110 spin_unlock(&unused_list_lock);
1111 return bh;
1113 spin_unlock(&unused_list_lock);
1115 #if 0
1117 * (Pending further analysis ...)
1118 * Ordinary (non-async) requests can use a different memory priority
1119 * to free up pages. Any swapping thus generated will use async
1120 * buffer heads.
1122 if(!async &&
1123 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1124 memset(bh, 0, sizeof(*bh));
1125 init_waitqueue_head(&bh->b_wait);
1126 return bh;
1128 #endif
1130 return NULL;
1133 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1135 bh->b_page = page;
1136 if (offset >= PAGE_SIZE)
1137 BUG();
1138 if (PageHighMem(page))
1140 * This catches illegal uses and preserves the offset:
1142 bh->b_data = (char *)(0 + offset);
1143 else
1144 bh->b_data = (char *)(page_address(page) + offset);
1148 * Create the appropriate buffers when given a page for data area and
1149 * the size of each buffer.. Use the bh->b_this_page linked list to
1150 * follow the buffers created. Return NULL if unable to create more
1151 * buffers.
1152 * The async flag is used to differentiate async IO (paging, swapping)
1153 * from ordinary buffer allocations, and only async requests are allowed
1154 * to sleep waiting for buffer heads.
1156 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1158 struct buffer_head *bh, *head;
1159 long offset;
1161 try_again:
1162 head = NULL;
1163 offset = PAGE_SIZE;
1164 while ((offset -= size) >= 0) {
1165 bh = get_unused_buffer_head(async);
1166 if (!bh)
1167 goto no_grow;
1169 bh->b_dev = B_FREE; /* Flag as unused */
1170 bh->b_this_page = head;
1171 head = bh;
1173 bh->b_state = 0;
1174 bh->b_next_free = NULL;
1175 bh->b_pprev = NULL;
1176 atomic_set(&bh->b_count, 0);
1177 bh->b_size = size;
1179 set_bh_page(bh, page, offset);
1181 bh->b_list = BUF_CLEAN;
1182 bh->b_end_io = end_buffer_io_bad;
1184 return head;
1186 * In case anything failed, we just free everything we got.
1188 no_grow:
1189 if (head) {
1190 spin_lock(&unused_list_lock);
1191 do {
1192 bh = head;
1193 head = head->b_this_page;
1194 __put_unused_buffer_head(bh);
1195 } while (head);
1196 spin_unlock(&unused_list_lock);
1198 /* Wake up any waiters ... */
1199 wake_up(&buffer_wait);
1203 * Return failure for non-async IO requests. Async IO requests
1204 * are not allowed to fail, so we have to wait until buffer heads
1205 * become available. But we don't want tasks sleeping with
1206 * partially complete buffers, so all were released above.
1208 if (!async)
1209 return NULL;
1211 /* We're _really_ low on memory. Now we just
1212 * wait for old buffer heads to become free due to
1213 * finishing IO. Since this is an async request and
1214 * the reserve list is empty, we're sure there are
1215 * async buffer heads in use.
1217 run_task_queue(&tq_disk);
1220 * Set our state for sleeping, then check again for buffer heads.
1221 * This ensures we won't miss a wake_up from an interrupt.
1223 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1224 goto try_again;
1227 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1229 struct buffer_head *head, *bh, *tail;
1230 int block;
1232 if (!PageLocked(page))
1233 BUG();
1235 * Allocate async buffer heads pointing to this page, just for I/O.
1236 * They don't show up in the buffer hash table, but they *are*
1237 * registered in page->buffers.
1239 head = create_buffers(page, size, 1);
1240 if (page->buffers)
1241 BUG();
1242 if (!head)
1243 BUG();
1244 tail = head;
1245 for (bh = head; bh; bh = bh->b_this_page) {
1246 block = *(b++);
1248 tail = bh;
1249 init_buffer(bh, end_buffer_io_async, NULL);
1250 bh->b_dev = dev;
1251 bh->b_blocknr = block;
1253 set_bit(BH_Mapped, &bh->b_state);
1255 tail->b_this_page = head;
1256 page_cache_get(page);
1257 page->buffers = head;
1258 return 0;
1261 static void unmap_buffer(struct buffer_head * bh)
1263 if (buffer_mapped(bh)) {
1264 mark_buffer_clean(bh);
1265 wait_on_buffer(bh);
1266 clear_bit(BH_Uptodate, &bh->b_state);
1267 clear_bit(BH_Mapped, &bh->b_state);
1268 clear_bit(BH_Req, &bh->b_state);
1269 clear_bit(BH_New, &bh->b_state);
1274 * We don't have to release all buffers here, but
1275 * we have to be sure that no dirty buffer is left
1276 * and no IO is going on (no buffer is locked), because
1277 * we have truncated the file and are going to free the
1278 * blocks on-disk..
1280 int block_flushpage(struct page *page, unsigned long offset)
1282 struct buffer_head *head, *bh, *next;
1283 unsigned int curr_off = 0;
1285 if (!PageLocked(page))
1286 BUG();
1287 if (!page->buffers)
1288 return 1;
1290 head = page->buffers;
1291 bh = head;
1292 do {
1293 unsigned int next_off = curr_off + bh->b_size;
1294 next = bh->b_this_page;
1297 * is this block fully flushed?
1299 if (offset <= curr_off)
1300 unmap_buffer(bh);
1301 curr_off = next_off;
1302 bh = next;
1303 } while (bh != head);
1306 * subtle. We release buffer-heads only if this is
1307 * the 'final' flushpage. We have invalidated the get_block
1308 * cached value unconditionally, so real IO is not
1309 * possible anymore.
1311 * If the free doesn't work out, the buffers can be
1312 * left around - they just turn into anonymous buffers
1313 * instead.
1315 if (!offset) {
1316 if (!try_to_free_buffers(page, 0)) {
1317 atomic_inc(&buffermem_pages);
1318 return 0;
1322 return 1;
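/*
 * Attach a full complement of (so far unmapped) async buffer heads to a page
 * that has none yet: every buffer starts out with b_blocknr 0 and
 * end_buffer_io_bad as its completion handler until the caller maps it.
 */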
1325 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1327 struct buffer_head *bh, *head, *tail;
1329 head = create_buffers(page, blocksize, 1);
1330 if (page->buffers)
1331 BUG();
1333 bh = head;
1334 do {
1335 bh->b_dev = inode->i_dev;
1336 bh->b_blocknr = 0;
1337 bh->b_end_io = end_buffer_io_bad;
1338 tail = bh;
1339 bh = bh->b_this_page;
1340 } while (bh);
1341 tail->b_this_page = head;
1342 page->buffers = head;
1343 page_cache_get(page);
1346 static void unmap_underlying_metadata(struct buffer_head * bh)
1348 struct buffer_head *old_bh;
1350 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1351 if (old_bh) {
1352 unmap_buffer(old_bh);
1353 /* Here we could run brelse or bforget. We use
1354 bforget because it will try to put the buffer
1355 in the freelist. */
1356 __bforget(old_bh);
1361 * block_write_full_page() is SMP-safe - currently it's still
1362 * being called with the kernel lock held, but the code is ready.
1364 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1366 int err, i, need_balance_dirty = 0;
1367 unsigned long block;
1368 struct buffer_head *bh, *head;
1370 if (!PageLocked(page))
1371 BUG();
1373 if (!page->buffers)
1374 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1375 head = page->buffers;
1377 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1379 bh = head;
1380 i = 0;
1381 do {
1383 * If the buffer isn't up-to-date, we can't be sure
1384 * that the buffer has been initialized with the proper
1385 * block number information etc..
1387 * Leave it to the low-level FS to make all those
1388 * decisions (block #0 may actually be a valid block)
1390 bh->b_end_io = end_buffer_io_sync;
1391 if (!buffer_mapped(bh)) {
1392 err = get_block(inode, block, bh, 1);
1393 if (err)
1394 goto out;
1395 if (buffer_new(bh))
1396 unmap_underlying_metadata(bh);
1398 set_bit(BH_Uptodate, &bh->b_state);
1399 if (!atomic_set_buffer_dirty(bh)) {
1400 __mark_dirty(bh, 0);
1401 need_balance_dirty = 1;
1404 bh = bh->b_this_page;
1405 block++;
1406 } while (bh != head);
1408 if (need_balance_dirty)
1409 balance_dirty(bh->b_dev);
1411 SetPageUptodate(page);
1412 return 0;
1413 out:
1414 ClearPageUptodate(page);
1415 return err;
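/*
 * Prepare the buffers covering [from, to) for a write: map any missing
 * blocks with get_block(..., 1), zero the parts of freshly allocated blocks
 * that fall outside the write range, and read in any buffer that is not
 * uptodate but only partially covered, waiting for those reads before
 * returning.
 */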
1418 static int __block_prepare_write(struct inode *inode, struct page *page,
1419 unsigned from, unsigned to, get_block_t *get_block)
1421 unsigned block_start, block_end;
1422 unsigned long block;
1423 int err = 0;
1424 unsigned blocksize, bbits;
1425 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1426 char *kaddr = (char *)kmap(page);
1428 blocksize = inode->i_sb->s_blocksize;
1429 if (!page->buffers)
1430 create_empty_buffers(page, inode, blocksize);
1431 head = page->buffers;
1433 bbits = inode->i_sb->s_blocksize_bits;
1434 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1436 for(bh = head, block_start = 0; bh != head || !block_start;
1437 block++, block_start=block_end, bh = bh->b_this_page) {
1438 if (!bh)
1439 BUG();
1440 block_end = block_start+blocksize;
1441 if (block_end <= from)
1442 continue;
1443 if (block_start >= to)
1444 break;
1445 bh->b_end_io = end_buffer_io_sync;
1446 if (!buffer_mapped(bh)) {
1447 err = get_block(inode, block, bh, 1);
1448 if (err)
1449 goto out;
1450 if (buffer_new(bh)) {
1451 unmap_underlying_metadata(bh);
1452 if (block_end > to)
1453 memset(kaddr+to, 0, block_end-to);
1454 if (block_start < from)
1455 memset(kaddr+block_start, 0, from-block_start);
1456 continue;
1459 if (!buffer_uptodate(bh) &&
1460 (block_start < from || block_end > to)) {
1461 ll_rw_block(READ, 1, &bh);
1462 *wait_bh++=bh;
1466 * If we issued read requests - let them complete.
1468 while(wait_bh > wait) {
1469 wait_on_buffer(*--wait_bh);
1470 err = -EIO;
1471 if (!buffer_uptodate(*wait_bh))
1472 goto out;
1474 return 0;
1475 out:
1476 return err;
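/*
 * Commit the range [from, to): mark the buffers inside it uptodate and dirty
 * (balancing bdflush if any of them was newly dirtied) and, if no buffer
 * outside the range was left not uptodate, mark the whole page uptodate so
 * the next read can skip readpage().
 */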
1479 static int __block_commit_write(struct inode *inode, struct page *page,
1480 unsigned from, unsigned to)
1482 unsigned block_start, block_end;
1483 int partial = 0, need_balance_dirty = 0;
1484 unsigned blocksize;
1485 struct buffer_head *bh, *head;
1487 blocksize = inode->i_sb->s_blocksize;
1489 for(bh = head = page->buffers, block_start = 0;
1490 bh != head || !block_start;
1491 block_start=block_end, bh = bh->b_this_page) {
1492 block_end = block_start + blocksize;
1493 if (block_end <= from || block_start >= to) {
1494 if (!buffer_uptodate(bh))
1495 partial = 1;
1496 } else {
1497 set_bit(BH_Uptodate, &bh->b_state);
1498 if (!atomic_set_buffer_dirty(bh)) {
1499 __mark_dirty(bh, 0);
1500 need_balance_dirty = 1;
1505 if (need_balance_dirty)
1506 balance_dirty(bh->b_dev);
1508 * If this is a partial write that happened to make all buffers
1509 * uptodate then we can optimize away a bogus readpage() for
1510 * the next read(). Here we 'discover' whether the page went
1511 * uptodate as a result of this (potentially partial) write.
1513 if (!partial)
1514 SetPageUptodate(page);
1515 return 0;
1519 * Generic "read page" function for block devices that have the normal
1520 * get_block functionality. This is most of the block device filesystems.
1521 * Reads the page asynchronously --- the unlock_buffer() and
1522 * mark_buffer_uptodate() functions propagate buffer state into the
1523 * page struct once IO has completed.
1525 int block_read_full_page(struct page *page, get_block_t *get_block)
1527 struct inode *inode = (struct inode*)page->mapping->host;
1528 unsigned long iblock, lblock;
1529 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1530 unsigned int blocksize, blocks;
1531 unsigned long kaddr = 0;
1532 int nr, i;
1534 if (!PageLocked(page))
1535 PAGE_BUG(page);
1536 blocksize = inode->i_sb->s_blocksize;
1537 if (!page->buffers)
1538 create_empty_buffers(page, inode, blocksize);
1539 head = page->buffers;
1541 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1542 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1543 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1544 bh = head;
1545 nr = 0;
1546 i = 0;
1548 do {
1549 if (buffer_uptodate(bh))
1550 continue;
1552 if (!buffer_mapped(bh)) {
1553 if (iblock < lblock)
1554 get_block(inode, iblock, bh, 0);
1555 if (!buffer_mapped(bh)) {
1556 if (!kaddr)
1557 kaddr = kmap(page);
1558 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1559 set_bit(BH_Uptodate, &bh->b_state);
1560 continue;
1564 init_buffer(bh, end_buffer_io_async, NULL);
1565 atomic_inc(&bh->b_count);
1566 arr[nr] = bh;
1567 nr++;
1568 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1570 if (nr) {
1571 if (Page_Uptodate(page))
1572 BUG();
1573 ll_rw_block(READ, nr, arr);
1574 } else {
1576 * all buffers are uptodate - we can set the page
1577 * uptodate as well.
1579 SetPageUptodate(page);
1580 UnlockPage(page);
1582 if (kaddr)
1583 kunmap(page);
1584 return 0;
1588 * For moronic filesystems that do not allow holes in files.
1589 * We may have to extend the file.
1592 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1594 struct address_space *mapping = page->mapping;
1595 struct inode *inode = (struct inode*)mapping->host;
1596 struct page *new_page;
1597 unsigned long pgpos;
1598 long status;
1599 unsigned zerofrom;
1600 unsigned blocksize = inode->i_sb->s_blocksize;
1601 char *kaddr;
1603 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1604 status = -ENOMEM;
1605 new_page = grab_cache_page(mapping, pgpos);
1606 if (!new_page)
1607 goto out;
1608 /* we might sleep */
1609 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1610 UnlockPage(new_page);
1611 page_cache_release(new_page);
1612 continue;
1614 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1615 if (zerofrom & (blocksize-1)) {
1616 *bytes |= (blocksize-1);
1617 (*bytes)++;
1619 status = __block_prepare_write(inode, new_page, zerofrom,
1620 PAGE_CACHE_SIZE, get_block);
1621 if (status)
1622 goto out_unmap;
1623 kaddr = (char*)page_address(new_page);
1624 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1625 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1626 kunmap(new_page);
1627 UnlockPage(new_page);
1628 page_cache_release(new_page);
1631 if (page->index < pgpos) {
1632 /* completely inside the area */
1633 zerofrom = offset;
1634 } else {
1635 /* page covers the boundary, find the boundary offset */
1636 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1638 /* if we will expand the thing last block will be filled */
1639 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1640 *bytes |= (blocksize-1);
1641 (*bytes)++;
1644 /* starting below the boundary? Nothing to zero out */
1645 if (offset <= zerofrom)
1646 zerofrom = offset;
1648 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1649 if (status)
1650 goto out1;
1651 kaddr = (char*)page_address(page);
1652 if (zerofrom < offset) {
1653 memset(kaddr+zerofrom, 0, offset-zerofrom);
1654 __block_commit_write(inode, page, zerofrom, offset);
1656 return 0;
1657 out1:
1658 ClearPageUptodate(page);
1659 kunmap(page);
1660 return status;
1662 out_unmap:
1663 ClearPageUptodate(new_page);
1664 kunmap(new_page);
1665 UnlockPage(new_page);
1666 page_cache_release(new_page);
1667 out:
1668 return status;
1671 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1672 get_block_t *get_block)
1674 struct inode *inode = (struct inode*)page->mapping->host;
1675 int err = __block_prepare_write(inode, page, from, to, get_block);
1676 if (err) {
1677 ClearPageUptodate(page);
1678 kunmap(page);
1680 return err;
1683 int generic_commit_write(struct file *file, struct page *page,
1684 unsigned from, unsigned to)
1686 struct inode *inode = (struct inode*)page->mapping->host;
1687 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1688 __block_commit_write(inode,page,from,to);
1689 kunmap(page);
1690 if (pos > inode->i_size)
1691 inode->i_size = pos;
1692 return 0;
1695 int block_write_full_page(struct page *page, get_block_t *get_block)
1697 struct inode *inode = (struct inode*)page->mapping->host;
1698 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1699 unsigned offset;
1700 int err;
1702 /* easy case */
1703 if (page->index < end_index)
1704 return __block_write_full_page(inode, page, get_block);
1706 /* things got complicated... */
1707 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1708 /* OK, are we completely out? */
1709 if (page->index >= end_index+1 || !offset)
1710 return -EIO;
1711 /* Sigh... will have to work, then... */
1712 err = __block_prepare_write(inode, page, 0, offset, get_block);
1713 if (!err) {
1714 memset((char *)page_address(page)+offset, 0, PAGE_CACHE_SIZE-offset);
1715 __block_commit_write(inode,page,0,offset);
1716 done:
1717 kunmap(page);
1718 return err;
1720 ClearPageUptodate(page);
1721 goto done;
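/*
 * Map a logical block of the file to its on-disk block number by calling
 * get_block() with create == 0 on a throwaway buffer_head; a result of 0
 * means a hole (or an unmapped block).
 */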
1724 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1726 struct buffer_head tmp;
1727 struct inode *inode = (struct inode*)mapping->host;
1728 tmp.b_state = 0;
1729 tmp.b_blocknr = 0;
1730 get_block(inode, block, &tmp, 0);
1731 return tmp.b_blocknr;
1735 * IO completion routine for a buffer_head being used for kiobuf IO: we
1736 * can't dispatch the kiobuf callback until io_count reaches 0.
1739 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1741 struct kiobuf *kiobuf;
1743 mark_buffer_uptodate(bh, uptodate);
1745 kiobuf = bh->b_kiobuf;
1746 unlock_buffer(bh);
1747 end_kio_request(kiobuf, uptodate);
1752 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1753 * for them to complete. Clean up the buffer_heads afterwards.
1756 static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
1758 int iosize;
1759 int i;
1760 struct buffer_head *tmp;
1762 if (rw == WRITE)
1763 rw = WRITERAW;
1764 ll_rw_block(rw, nr, bh);
1766 iosize = 0;
1767 spin_lock(&unused_list_lock);
1769 for (i = nr; --i >= 0; ) {
1770 iosize += size;
1771 tmp = bh[i];
1772 if (buffer_locked(tmp)) {
1773 spin_unlock(&unused_list_lock);
1774 wait_on_buffer(tmp);
1775 spin_lock(&unused_list_lock);
1778 if (!buffer_uptodate(tmp)) {
1779 /* We are traversing bh'es in reverse order so
1780 clearing iosize on error calculates the
1781 amount of IO before the first error. */
1782 iosize = 0;
1784 __put_unused_buffer_head(tmp);
1787 spin_unlock(&unused_list_lock);
1789 return iosize;
1793 * Start I/O on a physical range of kernel memory, defined by a vector
1794 * of kiobuf structs (much like a user-space iovec list).
1796 * The kiobuf must already be locked for IO. IO is submitted
1797 * asynchronously: you need to check page->locked, page->uptodate, and
1798 * maybe wait on page->wait.
1800 * It is up to the caller to make sure that there are enough blocks
1801 * passed in to completely map the iobufs to disk.
1804 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1805 kdev_t dev, unsigned long b[], int size)
1807 int err;
1808 int length;
1809 int transferred;
1810 int i;
1811 int bufind;
1812 int pageind;
1813 int bhind;
1814 int offset;
1815 unsigned long blocknr;
1816 struct kiobuf * iobuf = NULL;
1817 struct page * map;
1818 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1820 if (!nr)
1821 return 0;
1824 * First, do some alignment and validity checks
1826 for (i = 0; i < nr; i++) {
1827 iobuf = iovec[i];
1828 if ((iobuf->offset & (size-1)) ||
1829 (iobuf->length & (size-1)))
1830 return -EINVAL;
1831 if (!iobuf->nr_pages)
1832 panic("brw_kiovec: iobuf not initialised");
1836 * OK to walk down the iovec doing page IO on each page we find.
1838 bufind = bhind = transferred = err = 0;
1839 for (i = 0; i < nr; i++) {
1840 iobuf = iovec[i];
1841 offset = iobuf->offset;
1842 length = iobuf->length;
1843 iobuf->errno = 0;
1845 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1846 map = iobuf->maplist[pageind];
1847 if (!map) {
1848 err = -EFAULT;
1849 goto error;
1852 while (length > 0) {
1853 blocknr = b[bufind++];
1854 tmp = get_unused_buffer_head(0);
1855 if (!tmp) {
1856 err = -ENOMEM;
1857 goto error;
1860 tmp->b_dev = B_FREE;
1861 tmp->b_size = size;
1862 set_bh_page(tmp, map, offset);
1863 tmp->b_this_page = tmp;
1865 init_buffer(tmp, end_buffer_io_kiobuf, NULL);
1866 tmp->b_dev = dev;
1867 tmp->b_blocknr = blocknr;
1868 tmp->b_state = 1 << BH_Mapped;
1869 tmp->b_kiobuf = iobuf;
1871 if (rw == WRITE) {
1872 set_bit(BH_Uptodate, &tmp->b_state);
1873 set_bit(BH_Dirty, &tmp->b_state);
1876 bh[bhind++] = tmp;
1877 length -= size;
1878 offset += size;
1880 atomic_inc(&iobuf->io_count);
1883 * Start the IO if we have got too much
1885 if (bhind >= KIO_MAX_SECTORS) {
1886 err = do_kio(rw, bhind, bh, size);
1887 if (err >= 0)
1888 transferred += err;
1889 else
1890 goto finished;
1891 bhind = 0;
1894 if (offset >= PAGE_SIZE) {
1895 offset = 0;
1896 break;
1898 } /* End of block loop */
1899 } /* End of page loop */
1900 } /* End of iovec loop */
1902 /* Is there any IO still left to submit? */
1903 if (bhind) {
1904 err = do_kio(rw, bhind, bh, size);
1905 if (err >= 0)
1906 transferred += err;
1907 else
1908 goto finished;
1911 finished:
1912 if (transferred)
1913 return transferred;
1914 return err;
1916 error:
1917 /* We got an error allocating the bh'es. Just free the current
1918 buffer_heads and exit. */
1919 spin_lock(&unused_list_lock);
1920 for (i = bhind; --i >= 0; ) {
1921 __put_unused_buffer_head(bh[i]);
1923 spin_unlock(&unused_list_lock);
1924 goto finished;
1928 * Start I/O on a page.
1929 * This function expects the page to be locked and may return
1930 * before I/O is complete. You then have to check page->locked,
1931 * page->uptodate, and maybe wait on page->wait.
1933 * brw_page() is SMP-safe, although it's being called with the
1934 * kernel lock held - but the code is ready.
1936 * FIXME: we need a swapper_inode->get_block function to remove
1937 * some of the bmap kludges and interface ugliness here.
1939 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
1941 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1942 int nr, fresh /* temporary debugging flag */, block;
1944 if (!PageLocked(page))
1945 panic("brw_page: page not locked for I/O");
1946 // ClearPageError(page);
1948 * We pretty much rely on the page lock for this, because
1949 * create_page_buffers() might sleep.
1951 fresh = 0;
1952 if (!page->buffers) {
1953 create_page_buffers(rw, page, dev, b, size);
1954 fresh = 1;
1956 if (!page->buffers)
1957 BUG();
1959 head = page->buffers;
1960 bh = head;
1961 nr = 0;
1962 do {
1963 block = *(b++);
1965 if (fresh && (atomic_read(&bh->b_count) != 0))
1966 BUG();
1967 if (rw == READ) {
1968 if (!fresh)
1969 BUG();
1970 if (!buffer_uptodate(bh)) {
1971 arr[nr++] = bh;
1972 atomic_inc(&bh->b_count);
1974 } else { /* WRITE */
1975 if (!bh->b_blocknr) {
1976 if (!block)
1977 BUG();
1978 bh->b_blocknr = block;
1979 } else {
1980 if (!block)
1981 BUG();
1983 set_bit(BH_Uptodate, &bh->b_state);
1984 set_bit(BH_Dirty, &bh->b_state);
1985 arr[nr++] = bh;
1986 atomic_inc(&bh->b_count);
1988 bh = bh->b_this_page;
1989 } while (bh != head);
1990 if ((rw == READ) && nr) {
1991 if (Page_Uptodate(page))
1992 BUG();
1993 ll_rw_block(rw, nr, arr);
1994 } else {
1995 if (!nr && rw == READ) {
1996 SetPageUptodate(page);
1997 UnlockPage(page);
1999 if (nr && (rw == WRITE))
2000 ll_rw_block(rw, nr, arr);
2002 return 0;
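/*
 * Write the symlink body into page 0 of the inode's page cache using the
 * address_space prepare_write/commit_write operations, then read the page
 * back so it ends up fully uptodate, and finally mark the inode dirty.
 */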
2005 int block_symlink(struct inode *inode, const char *symname, int len)
2007 struct address_space *mapping = inode->i_mapping;
2008 struct page *page = grab_cache_page(mapping, 0);
2009 int err = -ENOMEM;
2010 char *kaddr;
2012 if (!page)
2013 goto fail;
2014 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2015 if (err)
2016 goto fail_map;
2017 kaddr = (char*)page_address(page);
2018 memcpy(kaddr, symname, len-1);
2019 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2021 * Notice that we are _not_ going to block here - end of page is
2022 * unmapped, so this will only try to map the rest of page, see
2023 * that it is unmapped (typically even will not look into inode -
2024 * ->i_size will be enough for everything) and zero it out.
2025 * OTOH it's obviously correct and should make the page up-to-date.
2027 err = mapping->a_ops->readpage(NULL, page);
2028 wait_on_page(page);
2029 page_cache_release(page);
2030 if (err < 0)
2031 goto fail;
2032 mark_inode_dirty(inode);
2033 return 0;
2034 fail_map:
2035 UnlockPage(page);
2036 page_cache_release(page);
2037 fail:
2038 return err;
2042 * Try to increase the number of buffers available: the size argument
2043 * is used to determine what kind of buffers we want.
2045 static int grow_buffers(int size)
2047 struct page * page;
2048 struct buffer_head *bh, *tmp;
2049 struct buffer_head * insert_point;
2050 int isize;
2052 if ((size & 511) || (size > PAGE_SIZE)) {
2053 printk("VFS: grow_buffers: size = %d\n",size);
2054 return 0;
2057 page = alloc_page(GFP_BUFFER);
2058 if (!page)
2059 goto out;
2060 bh = create_buffers(page, size, 0);
2061 if (!bh)
2062 goto no_buffer_head;
2064 isize = BUFSIZE_INDEX(size);
2066 spin_lock(&free_list[isize].lock);
2067 insert_point = free_list[isize].list;
2068 tmp = bh;
2069 while (1) {
2070 if (insert_point) {
2071 tmp->b_next_free = insert_point->b_next_free;
2072 tmp->b_prev_free = insert_point;
2073 insert_point->b_next_free->b_prev_free = tmp;
2074 insert_point->b_next_free = tmp;
2075 } else {
2076 tmp->b_prev_free = tmp;
2077 tmp->b_next_free = tmp;
2079 insert_point = tmp;
2080 if (tmp->b_this_page)
2081 tmp = tmp->b_this_page;
2082 else
2083 break;
2085 tmp->b_this_page = bh;
2086 free_list[isize].list = bh;
2087 spin_unlock(&free_list[isize].lock);
2089 page->buffers = bh;
2090 page->flags &= ~(1 << PG_referenced);
2091 lru_cache_add(page);
2092 atomic_inc(&buffermem_pages);
2093 return 1;
2095 no_buffer_head:
2096 page_cache_release(page);
2097 out:
2098 return 0;
2102 * Sync all the buffers on one page..
2104 * If we have old buffers that are locked, we'll
2105 * wait on them, but we won't wait on the new ones
2106 * we're writing out now.
2108 * This all is required so that we can free up memory
2109 * later.
2111 static void sync_page_buffers(struct buffer_head *bh, int wait)
2113 struct buffer_head * tmp = bh;
2115 do {
2116 struct buffer_head *p = tmp;
2117 tmp = tmp->b_this_page;
2118 if (buffer_locked(p)) {
2119 if (wait)
2120 __wait_on_buffer(p);
2121 } else if (buffer_dirty(p))
2122 ll_rw_block(WRITE, 1, &p);
2123 } while (tmp != bh);
2127 * Can the buffer be thrown out?
2129 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2130 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2133 * try_to_free_buffers() checks if all the buffers on this particular page
2134 * are unused, and free's the page if so.
2136 * Wake up bdflush() if this fails - if we're running low on memory due
2137 * to dirty buffers, we need to flush them out as quickly as possible.
2139 * NOTE: There are quite a number of ways that threads of control can
2140 * obtain a reference to a buffer head within a page. So we must
2141 * lock out all of these paths to cleanly toss the page.
2143 int try_to_free_buffers(struct page * page, int wait)
2145 struct buffer_head * tmp, * bh = page->buffers;
2146 int index = BUFSIZE_INDEX(bh->b_size);
2148 spin_lock(&lru_list_lock);
2149 write_lock(&hash_table_lock);
2150 spin_lock(&free_list[index].lock);
2151 tmp = bh;
2152 do {
2153 struct buffer_head *p = tmp;
2155 tmp = tmp->b_this_page;
2156 if (buffer_busy(p))
2157 goto busy_buffer_page;
2158 } while (tmp != bh);
2160 spin_lock(&unused_list_lock);
2161 tmp = bh;
2162 do {
2163 struct buffer_head * p = tmp;
2164 tmp = tmp->b_this_page;
2166 /* The buffer can be either on the regular
2167 * queues or on the free list..
2169 if (p->b_dev != B_FREE)
2170 __remove_from_queues(p);
2171 else
2172 __remove_from_free_list(p, index);
2173 __put_unused_buffer_head(p);
2174 } while (tmp != bh);
2175 spin_unlock(&unused_list_lock);
2177 /* Wake up anyone waiting for buffer heads */
2178 wake_up(&buffer_wait);
2180 /* And free the page */
2181 page->buffers = NULL;
2182 page_cache_release(page);
2183 spin_unlock(&free_list[index].lock);
2184 write_unlock(&hash_table_lock);
2185 spin_unlock(&lru_list_lock);
2186 return 1;
2188 busy_buffer_page:
2189 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2190 spin_unlock(&free_list[index].lock);
2191 write_unlock(&hash_table_lock);
2192 spin_unlock(&lru_list_lock);
2193 sync_page_buffers(bh, wait);
2194 return 0;
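/*
 * Editor's illustration: the main consumer of try_to_free_buffers() is the
 * VM reclaim path (mm/vmscan.c).  The fragment below is a paraphrase of
 * such a caller rather than the real code -- reclaim_page_buffers() is a
 * hypothetical name, and the real caller does more bookkeeping around it.
 */
static int reclaim_page_buffers(struct page *page, int wait)
{
	if (!page->buffers)
		return 1;	/* no buffers, nothing stops freeing the page */

	if (try_to_free_buffers(page, wait))
		return 1;	/* buffers are gone, page->buffers is now NULL */

	/*
	 * Some buffer on the page is still busy.  try_to_free_buffers()
	 * has already kicked off writeback via sync_page_buffers(), so
	 * just report failure and let the caller try another page.
	 */
	return 0;
}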
2197 /* ================== Debugging =================== */
2199 void show_buffers(void)
2201 #ifdef CONFIG_SMP
2202 struct buffer_head * bh;
2203 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2204 int protected = 0;
2205 int nlist;
2206 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2207 #endif
2209 printk("Buffer memory: %6dkB\n",
2210 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2212 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2213 if (!spin_trylock(&lru_list_lock))
2214 return;
2215 for(nlist = 0; nlist < NR_LIST; nlist++) {
2216 found = locked = dirty = used = lastused = protected = 0;
2217 bh = lru_list[nlist];
2218 if(!bh) continue;
2220 do {
2221 found++;
2222 if (buffer_locked(bh))
2223 locked++;
2224 if (buffer_protected(bh))
2225 protected++;
2226 if (buffer_dirty(bh))
2227 dirty++;
2228 if (atomic_read(&bh->b_count))
2229 used++, lastused = found;
2230 bh = bh->b_next_free;
2231 } while (bh != lru_list[nlist]);
2233 int tmp = nr_buffers_type[nlist];
2234 if (found != tmp)
2235 printk("%9s: BUG -> found %d, reported %d\n",
2236 buf_types[nlist], found, tmp);
2238 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2239 "%d locked, %d protected, %d dirty\n",
2240 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2241 used, lastused, locked, protected, dirty);
2243 spin_unlock(&lru_list_lock);
2244 #endif
2247 /* ===================== Init ======================= */
2250 * allocate the hash table and init the free list
2251 * Use gfp() for the hash table to decrease TLB misses, use
2252 * SLAB cache for buffer heads.
2254 void __init buffer_init(unsigned long mempages)
2256 int order, i;
2257 unsigned int nr_hash;
2259 /* The buffer cache hash table is less important these days,
2260 * trim it a bit.
2262 mempages >>= 14;
2264 mempages *= sizeof(struct buffer_head *);
2266 for (order = 0; (1 << order) < mempages; order++)
2269 /* try to allocate something until we get it or we're asking
2270 for something that is really too small */
2272 do {
2273 unsigned long tmp;
2275 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2276 bh_hash_mask = (nr_hash - 1);
2278 tmp = nr_hash;
2279 bh_hash_shift = 0;
2280 while((tmp >>= 1UL) != 0UL)
2281 bh_hash_shift++;
2283 hash_table = (struct buffer_head **)
2284 __get_free_pages(GFP_ATOMIC, order);
2285 } while (hash_table == NULL && --order > 0);
2286 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2287 nr_hash, order, (PAGE_SIZE << order));
2289 if (!hash_table)
2290 panic("Failed to allocate buffer hash table\n");
2292 /* Setup hash chains. */
2293 for(i = 0; i < nr_hash; i++)
2294 hash_table[i] = NULL;
2296 /* Setup free lists. */
2297 for(i = 0; i < NR_SIZES; i++) {
2298 free_list[i].list = NULL;
2299 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2302 /* Setup lru lists. */
2303 for(i = 0; i < NR_LIST; i++)
2304 lru_list[i] = NULL;
2306 bh_cachep = kmem_cache_create("buffer_head",
2307 sizeof(struct buffer_head),
2309 SLAB_HWCACHE_ALIGN, NULL, NULL);
2310 if(!bh_cachep)
2311 panic("Cannot create buffer head SLAB cache\n");
2315 /* ====================== bdflush support =================== */
2317 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2318 * response to dirty buffers. Once this process is activated, we write back
2319 * a limited number of buffers to the disks and then go back to sleep again.
2321 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2322 struct task_struct *bdflush_tsk = 0;
2324 void wakeup_bdflush(int block)
2326 DECLARE_WAITQUEUE(wait, current);
2328 if (current == bdflush_tsk)
2329 return;
2331 if (!block) {
2332 wake_up_process(bdflush_tsk);
2333 return;
2336 /* kflushd can wake us up before we have a chance to
2337    go to sleep, so we must be smart in handling
2338    this wakeup event from kflushd to avoid deadlocking on SMP
2339    (we are not holding any locks anymore in these two paths). */
2340 __set_current_state(TASK_UNINTERRUPTIBLE);
2341 add_wait_queue(&bdflush_done, &wait);
2343 wake_up_process(bdflush_tsk);
2344 schedule();
2346 remove_wait_queue(&bdflush_done, &wait);
2347 __set_current_state(TASK_RUNNING);
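/*
 * Editor's illustration: the typical caller is the balance_dirty() path
 * earlier in this file -- a writer that has dirtied too much of the buffer
 * cache asks bdflush for help, and only passes block != 0 (and therefore
 * waits for bdflush_done) when things are bad enough that it should
 * throttle itself.  The fragment below is a paraphrase of that caller, not
 * a verbatim copy; treat the exact return-value convention of
 * balance_dirty_state() as an assumption of this sketch.
 */
static void example_balance_dirty(kdev_t dev)
{
	int state = balance_dirty_state(dev);

	if (state < 0)
		return;			/* not enough dirty buffers to bother */
	/* state > 0 means "too dirty": block until bdflush has done a pass. */
	wakeup_bdflush(state);
}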
2350 /* This is the _only_ function that deals with flushing async writes
2351 to disk.
2352 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2353 as all dirty buffers live _only_ in the DIRTY lru list.
2354 As we never browse the LOCKED and CLEAN lru lists, they are in fact
2355 completely useless. */
2356 static int flush_dirty_buffers(int check_flushtime)
2358 struct buffer_head * bh, *next;
2359 int flushed = 0, i;
2361 restart:
2362 spin_lock(&lru_list_lock);
2363 bh = lru_list[BUF_DIRTY];
2364 if (!bh)
2365 goto out_unlock;
2366 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2367 next = bh->b_next_free;
2369 if (!buffer_dirty(bh)) {
2370 __refile_buffer(bh);
2371 continue;
2373 if (buffer_locked(bh))
2374 continue;
2376 if (check_flushtime) {
2377 /* The dirty lru list is chronologically ordered so
2378 if the current bh is not yet timed out,
2379 then all the following bhs
2380 will be too young as well. */
2381 if (time_before(jiffies, bh->b_flushtime))
2382 goto out_unlock;
2383 } else {
2384 if (++flushed > bdf_prm.b_un.ndirty)
2385 goto out_unlock;
2388 /* OK, now we are committed to write it out. */
2389 atomic_inc(&bh->b_count);
2390 spin_unlock(&lru_list_lock);
2391 ll_rw_block(WRITE, 1, &bh);
2392 atomic_dec(&bh->b_count);
2394 if (current->need_resched)
2395 schedule();
2396 goto restart;
2398 out_unlock:
2399 spin_unlock(&lru_list_lock);
2401 return flushed;
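/*
 * Editor's note on the b_flushtime test above: b_flushtime is a deadline in
 * jiffies, and time_before() compares two jiffies values using a signed
 * difference so the comparison stays correct when the jiffies counter wraps
 * around.  The helper below is a paraphrase of that idea (the real macro is
 * provided by the kernel headers, not defined here).
 */
static inline int example_not_yet_due(unsigned long now, unsigned long deadline)
{
	/* Equivalent in spirit to time_before(now, deadline). */
	return (long) (now - deadline) < 0;
}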
2405  * Here we attempt to write back old buffers. We also try to flush inodes
2406  * and supers, since this function is essentially "update", and
2407  * otherwise there would be no way of ensuring that these quantities ever
2408  * get written back. Ideally, we would have a timestamp on the inodes
2409  * and superblocks so that we could write back only the old ones as well.
2412 static int sync_old_buffers(void)
2414 lock_kernel();
2415 sync_supers(0);
2416 sync_inodes(0);
2417 unlock_kernel();
2419 flush_dirty_buffers(1);
2420 /* must really sync all the active I/O requests to disk here */
2421 run_task_queue(&tq_disk);
2422 return 0;
2425 int block_sync_page(struct page *page)
2427 run_task_queue(&tq_disk);
2428 return 0;
2431 /* This is the interface to bdflush. As we get more sophisticated, we can
2432 * pass tuning parameters to this "process", to adjust how it behaves.
2433 * We would want to verify each parameter, however, to make sure that it
2434 * is reasonable. */
2436 asmlinkage long sys_bdflush(int func, long data)
2438 if (!capable(CAP_SYS_ADMIN))
2439 return -EPERM;
2441 if (func == 1) {
2442 /* do_exit directly and let kupdate do its work alone. */
2443 do_exit(0);
2444 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2445 a syscall that doesn't care about the current mm context. */
2446 int error;
2447 struct mm_struct *user_mm;
2450  * bdflush will spend all of its time in kernel-space,
2451  * without touching user-space, so we can switch it into
2452  * 'lazy TLB mode' to reduce the cost of context-switches
2453  * to and from bdflush.
2454  */
2455 user_mm = start_lazy_tlb();
2456 error = sync_old_buffers();
2457 end_lazy_tlb(user_mm);
2458 return error;
2459 #endif
2462 /* Even func values starting at 2 read a parameter, odd ones write it:
	   func 2 reads param 0, func 3 writes param 0, func 4 reads param 1, etc. */
2463 if (func >= 2) {
2464 int i = (func-2) >> 1;
2465 if (i >= 0 && i < N_PARAM) {
2466 if ((func & 1) == 0)
2467 return put_user(bdf_prm.data[i], (int*)data);
2469 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2470 bdf_prm.data[i] = data;
2471 return 0;
2474 return -EINVAL;
2477 /* Calling with func 0 used to launch the actual bdflush and then never
2478  * return (unless it was explicitly killed). We return zero here to
2479  * remain semi-compatible with present update(8) programs.
2480  */
2481 return 0;
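/*
 * Editor's illustration (userspace, not kernel code): given the encoding
 * above, func 2*i+2 reads tuning parameter i through the pointer passed in
 * "data", and func 2*i+3 sets parameter i to the value passed in "data"
 * (rejected with -EINVAL when out of range, -EPERM without CAP_SYS_ADMIN).
 * A tuning tool could therefore do roughly the following; whether the libc
 * headers define SYS_bdflush is an assumption of this sketch.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int value = 0;

	/* func 2: read parameter 0 into "value". */
	if (syscall(SYS_bdflush, 2, (long) &value) == 0)
		printf("bdflush parameter 0 is %d\n", value);
	else
		perror("bdflush read");

	/* func 3: write parameter 0 back unchanged. */
	if (syscall(SYS_bdflush, 3, (long) value) != 0)
		perror("bdflush write");

	return 0;
}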
2485 * This is the actual bdflush daemon itself. It used to be started from
2486 * the syscall above, but now we launch it ourselves internally with
2487 * kernel_thread(...) directly after the first thread in init/main.c
2489 int bdflush(void *sem)
2491 struct task_struct *tsk = current;
2492 int flushed;
2493 /*
2494  * We have a bare-bones task_struct, and really should fill
2495  * in a few more things so "top" and /proc/2/{exe,root,cwd}
2496  * display semi-sane things. Not really crucial, though...
2499 tsk->session = 1;
2500 tsk->pgrp = 1;
2501 strcpy(tsk->comm, "kflushd");
2502 bdflush_tsk = tsk;
2504 /* avoid getting signals */
2505 spin_lock_irq(&tsk->sigmask_lock);
2506 flush_signals(tsk);
2507 sigfillset(&tsk->blocked);
2508 recalc_sigpending(tsk);
2509 spin_unlock_irq(&tsk->sigmask_lock);
2511 up((struct semaphore *)sem);
2513 for (;;) {
2514 CHECK_EMERGENCY_SYNC
2516 flushed = flush_dirty_buffers(0);
2518 /* If wakeup_bdflush() wakes us up
2519    after our bdflush_done wakeup, then
2520    we must make sure not to sleep
2521    in schedule() below, otherwise
2522    wakeup_bdflush() may wait for a
2523    bdflush_done wakeup that would never arrive
2524    (as we would be sleeping) and so it would
2525    deadlock on SMP. */
2526 __set_current_state(TASK_INTERRUPTIBLE);
2527 wake_up(&bdflush_done);
2528 /*
2529  * If there are still a lot of dirty buffers around,
2530  * skip the sleep and flush some more. Otherwise, we
2531  * go to sleep waiting for a wakeup.
2532  */
2533 if (!flushed || balance_dirty_state(NODEV) < 0)
2534 schedule();
2535 /* Remember to mark us as running otherwise
2536 the next schedule will block. */
2537 __set_current_state(TASK_RUNNING);
2542  * This is the kernel update daemon. It used to live in userspace,
2543  * but since it needs to run safely we don't want it killed by mistake.
2544  * You don't need to change your userspace configuration, since
2545  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2546  */
2547 int kupdate(void *sem)
2549 struct task_struct * tsk = current;
2550 int interval;
2552 tsk->session = 1;
2553 tsk->pgrp = 1;
2554 strcpy(tsk->comm, "kupdate");
2556 /* sigstop and sigcont will stop and wake up kupdate */
2557 spin_lock_irq(&tsk->sigmask_lock);
2558 sigfillset(&tsk->blocked);
2559 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2560 recalc_sigpending(tsk);
2561 spin_unlock_irq(&tsk->sigmask_lock);
2563 up((struct semaphore *)sem);
2565 for (;;) {
2566 /* update interval */
2567 interval = bdf_prm.b_un.interval;
2568 if (interval) {
2569 tsk->state = TASK_INTERRUPTIBLE;
2570 schedule_timeout(interval);
2571 } else {
2572 stop_kupdate:
2573 tsk->state = TASK_STOPPED;
2574 schedule(); /* wait for SIGCONT */
2576 /* check for sigstop */
2577 if (signal_pending(tsk)) {
2578 int stopped = 0;
2579 spin_lock_irq(&tsk->sigmask_lock);
2580 if (sigismember(&tsk->signal, SIGSTOP)) {
2581 sigdelset(&tsk->signal, SIGSTOP);
2582 stopped = 1;
2584 recalc_sigpending(tsk);
2585 spin_unlock_irq(&tsk->sigmask_lock);
2586 if (stopped)
2587 goto stop_kupdate;
2589 #ifdef DEBUG
2590 printk("kupdate() activated...\n");
2591 #endif
2592 sync_old_buffers();
2596 static int __init bdflush_init(void)
2598 DECLARE_MUTEX_LOCKED(sem);
2599 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2600 down(&sem);
2601 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2602 down(&sem);
2603 return 0;
2606 module_init(bdflush_init)