1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
49 #include <asm/uaccess.h>
50 #include <asm/io.h>
51 #include <asm/bitops.h>
52 #include <asm/mmu_context.h>
54 #define NR_SIZES 7
55 static char buffersize_index[65] =
56 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
57 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
60 6};
62 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
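/* For example, a 4096-byte buffer gives 4096>>9 == 8 and
 * buffersize_index[8] == 3, so 4k buffers hang off free_list[3]; the
 * largest supported size, 32k, maps to index 6 (hence NR_SIZES 7). */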
63 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
64 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
65 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
66 number of unused buffer heads */
68 /* Anti-deadlock ordering:
69 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
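/* In practice this means code that needs more than one of these locks
 * nests them outermost-first, e.g.:
 *
 *	spin_lock(&lru_list_lock);
 *	write_lock(&hash_table_lock);
 *	...
 *	write_unlock(&hash_table_lock);
 *	spin_unlock(&lru_list_lock);
 *
 * as getblk() and __invalidate_buffers() below do. */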
72 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
75 * Hash table gook..
77 static unsigned int bh_hash_mask;
78 static unsigned int bh_hash_shift;
79 static struct buffer_head **hash_table;
80 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
82 static struct buffer_head *lru_list[NR_LIST];
83 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
84 static int nr_buffers_type[NR_LIST];
85 static unsigned long size_buffers_type[NR_LIST];
87 static struct buffer_head * unused_list;
88 static int nr_unused_buffer_heads;
89 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
90 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
92 struct bh_free_head {
93 struct buffer_head *list;
94 spinlock_t lock;
96 static struct bh_free_head free_list[NR_SIZES];
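/* One free list per supported buffer size (512 bytes up to 32k),
 * indexed by BUFSIZE_INDEX() above. */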
98 static int grow_buffers(int size);
99 static void __refile_buffer(struct buffer_head *);
101 /* This is used by some architectures to estimate available memory. */
102 atomic_t buffermem_pages = ATOMIC_INIT(0);
104 /* Here is the parameter block for the bdflush process. If you add or
105 * remove any of the parameters, make sure to update kernel/sysctl.c.
108 #define N_PARAM 9
110 /* The dummy values in this structure are left in there for compatibility
111 * with old programs that play with the /proc entries.
113 union bdflush_param {
114 struct {
115 int nfract; /* Percentage of buffer cache dirty to
116 activate bdflush */
117 int ndirty; /* Maximum number of dirty blocks to write out per
118 wake-cycle */
119 int nrefill; /* Number of clean buffers to try to obtain
120 each time we call refill */
121 int nref_dirt; /* Dirty buffer threshold for activating bdflush
122 when trying to refill buffers. */
123 int interval; /* jiffies delay between kupdate flushes */
124 int age_buffer; /* Time for normal buffer to age before we flush it */
125 int dummy1; /* unused, was age_super */
126 int dummy2; /* unused */
127 int dummy3; /* unused */
128 } b_un;
129 unsigned int data[N_PARAM];
130 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
132 /* These are the min and max parameter values that we will allow to be assigned */
133 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
134 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
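/* With the defaults above: bdflush writes back at most 500 dirty blocks
 * per wakeup (ndirty), kupdate runs every 5 seconds (interval), and a
 * dirty buffer ages for 30 seconds (age_buffer) before it is flushed. */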
137 * Rewrote the wait-routines to use the "new" wait-queue functionality,
138 * getting rid of the cli-sti pairs. The wait-queue routines still
139 * need cli-sti, but now it's just a couple of 386 instructions or so.
141 * Note that the real wait_on_buffer() is an inline function that checks
142 * if 'b_wait' is set before calling this, so that the queues aren't set
143 * up unnecessarily.
145 void __wait_on_buffer(struct buffer_head * bh)
147 struct task_struct *tsk = current;
148 DECLARE_WAITQUEUE(wait, tsk);
150 atomic_inc(&bh->b_count);
151 add_wait_queue(&bh->b_wait, &wait);
152 do {
153 run_task_queue(&tq_disk);
154 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
155 if (!buffer_locked(bh))
156 break;
157 schedule();
158 } while (buffer_locked(bh));
159 tsk->state = TASK_RUNNING;
160 remove_wait_queue(&bh->b_wait, &wait);
161 atomic_dec(&bh->b_count);
164 /* Call sync_buffers with wait!=0 to ensure that the call does not
165 * return until all buffer writes have completed. Sync() may return
166 * before the writes have finished; fsync() may not.
169 /* Godamity-damn. Some buffers (bitmaps for filesystems)
170 * spontaneously dirty themselves without ever brelse being called.
171 * We will ultimately want to put these in a separate list, but for
172 * now we search all of the lists for dirty buffers.
174 static int sync_buffers(kdev_t dev, int wait)
176 int i, retry, pass = 0, err = 0;
177 struct buffer_head * bh, *next;
179 /* One pass for no-wait, three for wait:
180 * 0) write out all dirty, unlocked buffers;
181 * 1) write out all dirty buffers, waiting if locked;
182 * 2) wait for completion by waiting for all buffers to unlock.
184 do {
185 retry = 0;
187 /* We search all lists as a failsafe mechanism, not because we expect
188 * there to be dirty buffers on any of the other lists.
190 repeat:
191 spin_lock(&lru_list_lock);
192 bh = lru_list[BUF_DIRTY];
193 if (!bh)
194 goto repeat2;
196 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
197 next = bh->b_next_free;
199 if (!lru_list[BUF_DIRTY])
200 break;
201 if (dev && bh->b_dev != dev)
202 continue;
203 if (buffer_locked(bh)) {
204 /* Buffer is locked; skip it unless wait is
205 * requested AND pass > 0.
207 if (!wait || !pass) {
208 retry = 1;
209 continue;
211 atomic_inc(&bh->b_count);
212 spin_unlock(&lru_list_lock);
213 wait_on_buffer (bh);
214 atomic_dec(&bh->b_count);
215 goto repeat;
218 /* If an unlocked buffer is not uptodate, there has
219 * been an IO error. Skip it.
221 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
222 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
223 err = -EIO;
224 continue;
227 /* Don't write clean buffers. Don't write ANY buffers
228 * on the third pass.
230 if (!buffer_dirty(bh) || pass >= 2)
231 continue;
233 atomic_inc(&bh->b_count);
234 spin_unlock(&lru_list_lock);
235 ll_rw_block(WRITE, 1, &bh);
236 atomic_dec(&bh->b_count);
237 retry = 1;
238 goto repeat;
241 repeat2:
242 bh = lru_list[BUF_LOCKED];
243 if (!bh) {
244 spin_unlock(&lru_list_lock);
245 break;
247 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
248 next = bh->b_next_free;
250 if (!lru_list[BUF_LOCKED])
251 break;
252 if (dev && bh->b_dev != dev)
253 continue;
254 if (buffer_locked(bh)) {
255 /* Buffer is locked; skip it unless wait is
256 * requested AND pass > 0.
258 if (!wait || !pass) {
259 retry = 1;
260 continue;
262 atomic_inc(&bh->b_count);
263 spin_unlock(&lru_list_lock);
264 wait_on_buffer (bh);
265 spin_lock(&lru_list_lock);
266 atomic_dec(&bh->b_count);
267 goto repeat2;
270 spin_unlock(&lru_list_lock);
272 /* If we are waiting for the sync to succeed, and if any dirty
273 * blocks were written, then repeat; on the second pass, only
274 * wait for buffers being written (do not pass to write any
275 * more buffers on the second pass).
277 } while (wait && retry && ++pass<=2);
278 return err;
281 void sync_dev(kdev_t dev)
283 sync_supers(dev);
284 sync_inodes(dev);
285 DQUOT_SYNC(dev);
286 /* sync all the dirty buffers out to disk only _after_ all the
287 high level layers have finished generating dirty buffer data
288 (or we'd return with some buffers still dirty on the block device,
289 breaking the semantics of this call) */
290 sync_buffers(dev, 0);
292 * FIXME(eric) we need to sync the physical devices here.
293 * This is because some (scsi) controllers have huge amounts of
294 * cache onboard (hundreds of Mb), and we need to instruct
295 * them to commit all of the dirty memory to disk, and we should
296 * not return until this has happened.
298 * This would need to get implemented by going through the assorted
299 * layers so that each block major number can be synced, and this
300 * would call down into the upper and mid-layer scsi.
304 int fsync_dev(kdev_t dev)
306 sync_buffers(dev, 0);
308 lock_kernel();
309 sync_supers(dev);
310 sync_inodes(dev);
311 DQUOT_SYNC(dev);
312 unlock_kernel();
314 return sync_buffers(dev, 1);
317 asmlinkage long sys_sync(void)
319 fsync_dev(0);
320 return 0;
324 * filp may be NULL if called via the msync of a vma.
327 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
329 struct inode * inode = dentry->d_inode;
330 struct super_block * sb;
331 kdev_t dev;
332 int ret;
334 lock_kernel();
335 /* sync the inode to buffers */
336 write_inode_now(inode, 0);
338 /* sync the superblock to buffers */
339 sb = inode->i_sb;
340 wait_on_super(sb);
341 if (sb->s_op && sb->s_op->write_super)
342 sb->s_op->write_super(sb);
344 /* .. finally sync the buffers to disk */
345 dev = inode->i_dev;
346 ret = sync_buffers(dev, 1);
347 unlock_kernel();
348 return ret;
351 asmlinkage long sys_fsync(unsigned int fd)
353 struct file * file;
354 struct dentry * dentry;
355 struct inode * inode;
356 int err;
358 err = -EBADF;
359 file = fget(fd);
360 if (!file)
361 goto out;
363 dentry = file->f_dentry;
364 inode = dentry->d_inode;
366 err = -EINVAL;
367 if (!file->f_op || !file->f_op->fsync)
368 goto out_putf;
370 /* We need to protect against concurrent writers.. */
371 down(&inode->i_sem);
372 err = file->f_op->fsync(file, dentry, 0);
373 up(&inode->i_sem);
375 out_putf:
376 fput(file);
377 out:
378 return err;
381 asmlinkage long sys_fdatasync(unsigned int fd)
383 struct file * file;
384 struct dentry * dentry;
385 struct inode * inode;
386 int err;
388 err = -EBADF;
389 file = fget(fd);
390 if (!file)
391 goto out;
393 dentry = file->f_dentry;
394 inode = dentry->d_inode;
396 err = -EINVAL;
397 if (!file->f_op || !file->f_op->fsync)
398 goto out_putf;
400 down(&inode->i_sem);
401 err = file->f_op->fsync(file, dentry, 1);
402 up(&inode->i_sem);
404 out_putf:
405 fput(file);
406 out:
407 return err;
410 /* After several hours of tedious analysis, the following hash
411 * function won. Do not mess with it... -DaveM
413 #define _hashfn(dev,block) \
414 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
415 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
416 ((block) << (bh_hash_shift - 12))))
417 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
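/* hash() picks a bucket by folding the device and block numbers over
 * bh_hash_shift bits and masking with bh_hash_mask (one less than the
 * hash table size). */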
419 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
421 if ((bh->b_next = *head) != NULL)
422 bh->b_next->b_pprev = &bh->b_next;
423 *head = bh;
424 bh->b_pprev = head;
427 static __inline__ void __hash_unlink(struct buffer_head *bh)
429 if (bh->b_pprev) {
430 if (bh->b_next)
431 bh->b_next->b_pprev = bh->b_pprev;
432 *(bh->b_pprev) = bh->b_next;
433 bh->b_pprev = NULL;
437 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
439 struct buffer_head **bhp = &lru_list[blist];
441 if(!*bhp) {
442 *bhp = bh;
443 bh->b_prev_free = bh;
445 bh->b_next_free = *bhp;
446 bh->b_prev_free = (*bhp)->b_prev_free;
447 (*bhp)->b_prev_free->b_next_free = bh;
448 (*bhp)->b_prev_free = bh;
449 nr_buffers_type[blist]++;
450 size_buffers_type[blist] += bh->b_size;
453 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
455 if (bh->b_prev_free || bh->b_next_free) {
456 bh->b_prev_free->b_next_free = bh->b_next_free;
457 bh->b_next_free->b_prev_free = bh->b_prev_free;
458 if (lru_list[blist] == bh)
459 lru_list[blist] = bh->b_next_free;
460 if (lru_list[blist] == bh)
461 lru_list[blist] = NULL;
462 bh->b_next_free = bh->b_prev_free = NULL;
463 nr_buffers_type[blist]--;
464 size_buffers_type[blist] -= bh->b_size;
468 static void __remove_from_free_list(struct buffer_head * bh, int index)
470 if(bh->b_next_free == bh)
471 free_list[index].list = NULL;
472 else {
473 bh->b_prev_free->b_next_free = bh->b_next_free;
474 bh->b_next_free->b_prev_free = bh->b_prev_free;
475 if (free_list[index].list == bh)
476 free_list[index].list = bh->b_next_free;
478 bh->b_next_free = bh->b_prev_free = NULL;
481 /* must be called with both the hash_table_lock and the lru_list_lock
482 held */
483 static void __remove_from_queues(struct buffer_head *bh)
485 __hash_unlink(bh);
486 __remove_from_lru_list(bh, bh->b_list);
489 static void __insert_into_queues(struct buffer_head *bh)
491 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
493 __hash_link(bh, head);
494 __insert_into_lru_list(bh, bh->b_list);
497 /* This function must only run if there are no other
498 * references _anywhere_ to this buffer head.
500 static void put_last_free(struct buffer_head * bh)
502 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
503 struct buffer_head **bhp = &head->list;
505 bh->b_state = 0;
507 spin_lock(&head->lock);
508 bh->b_dev = B_FREE;
509 if(!*bhp) {
510 *bhp = bh;
511 bh->b_prev_free = bh;
513 bh->b_next_free = *bhp;
514 bh->b_prev_free = (*bhp)->b_prev_free;
515 (*bhp)->b_prev_free->b_next_free = bh;
516 (*bhp)->b_prev_free = bh;
517 spin_unlock(&head->lock);
521 * Why like this, I hear you say... The reason is race-conditions.
522 * As we don't lock buffers (unless we are reading them, that is),
523 * something might happen to it while we sleep (ie a read-error
524 * will force it bad). This shouldn't really happen currently, but
525 * the code is ready.
527 static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size)
529 struct buffer_head *bh = hash(dev, block);
531 for (; bh; bh = bh->b_next)
532 if (bh->b_blocknr == block &&
533 bh->b_size == size &&
534 bh->b_dev == dev)
535 break;
536 if (bh)
537 atomic_inc(&bh->b_count);
539 return bh;
542 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
544 struct buffer_head *bh;
546 read_lock(&hash_table_lock);
547 bh = __get_hash_table(dev, block, size);
548 read_unlock(&hash_table_lock);
550 return bh;
553 unsigned int get_hardblocksize(kdev_t dev)
556 * Get the hard sector size for the given device. If we don't know
557 * what it is, return 0.
559 if (hardsect_size[MAJOR(dev)] != NULL) {
560 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
561 if (blksize != 0)
562 return blksize;
566 * We don't know what the hardware sector size for this device is.
567 * Return 0 indicating that we don't know.
569 return 0;
572 void buffer_insert_inode_queue(struct buffer_head *bh, struct inode *inode)
574 spin_lock(&lru_list_lock);
575 if (bh->b_inode)
576 list_del(&bh->b_inode_buffers);
577 bh->b_inode = inode;
578 list_add(&bh->b_inode_buffers, &inode->i_dirty_buffers);
579 spin_unlock(&lru_list_lock);
582 /* The caller must have the lru_list lock before calling the
583 remove_inode_queue functions. */
584 static void __remove_inode_queue(struct buffer_head *bh)
586 bh->b_inode = NULL;
587 list_del(&bh->b_inode_buffers);
590 static inline void remove_inode_queue(struct buffer_head *bh)
592 if (bh->b_inode)
593 __remove_inode_queue(bh);
596 int inode_has_buffers(struct inode *inode)
598 int ret;
600 spin_lock(&lru_list_lock);
601 ret = !list_empty(&inode->i_dirty_buffers);
602 spin_unlock(&lru_list_lock);
604 return ret;
608 /* If invalidate_buffers() will trash dirty buffers, it means some kind
609 of fs corruption is going on. Trashing dirty data always implies losing
610 information that was supposed to have been stored on the physical layer
611 by the user.
613 Thus invalidate_buffers in general usage is not allowed to trash dirty
614 buffers. For example, ioctl(BLKFLSBUF) expects dirty data to be preserved.
616 NOTE: in the case where the user removes a removable-media disk while
617 there's still dirty data not synced to disk (due to a bug in the device driver
618 or to an error by the user), by not destroying the dirty buffers we could
619 also corrupt the next media inserted; thus a parameter is
620 necessary to handle this case in the safest way possible (trying
621 not to corrupt the newly inserted disk with data belonging to
622 the old, now corrupted one). For the ramdisk, the natural thing
623 to do in order to release its memory is to destroy the dirty buffers.
625 These are the two special cases. Normal usage implies that the device driver
626 issues a sync on the device (without waiting for I/O completion) and
627 then an invalidate_buffers call that doesn't trash dirty buffers. */
628 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
630 int i, nlist, slept;
631 struct buffer_head * bh, * bh_next;
633 retry:
634 slept = 0;
635 spin_lock(&lru_list_lock);
636 for(nlist = 0; nlist < NR_LIST; nlist++) {
637 bh = lru_list[nlist];
638 if (!bh)
639 continue;
640 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
641 bh_next = bh->b_next_free;
642 if (bh->b_dev != dev)
643 continue;
644 if (buffer_locked(bh)) {
645 atomic_inc(&bh->b_count);
646 spin_unlock(&lru_list_lock);
647 wait_on_buffer(bh);
648 slept = 1;
649 spin_lock(&lru_list_lock);
650 atomic_dec(&bh->b_count);
653 write_lock(&hash_table_lock);
654 if (!atomic_read(&bh->b_count) &&
655 (destroy_dirty_buffers || !buffer_dirty(bh))) {
656 remove_inode_queue(bh);
657 __remove_from_queues(bh);
658 put_last_free(bh);
660 /* else complain loudly? */
662 write_unlock(&hash_table_lock);
663 if (slept)
664 goto out;
667 out:
668 spin_unlock(&lru_list_lock);
669 if (slept)
670 goto retry;
673 void set_blocksize(kdev_t dev, int size)
675 extern int *blksize_size[];
676 int i, nlist, slept;
677 struct buffer_head * bh, * bh_next;
679 if (!blksize_size[MAJOR(dev)])
680 return;
682 /* Size must be a power of two, and between 512 and PAGE_SIZE */
683 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
684 panic("Invalid blocksize passed to set_blocksize");
686 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
687 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
688 return;
690 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
691 return;
692 sync_buffers(dev, 2);
693 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
695 retry:
696 slept = 0;
697 spin_lock(&lru_list_lock);
698 for(nlist = 0; nlist < NR_LIST; nlist++) {
699 bh = lru_list[nlist];
700 if (!bh)
701 continue;
702 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
703 bh_next = bh->b_next_free;
704 if (bh->b_dev != dev || bh->b_size == size)
705 continue;
706 if (buffer_locked(bh)) {
707 atomic_inc(&bh->b_count);
708 spin_unlock(&lru_list_lock);
709 wait_on_buffer(bh);
710 slept = 1;
711 spin_lock(&lru_list_lock);
712 atomic_dec(&bh->b_count);
715 write_lock(&hash_table_lock);
716 if (!atomic_read(&bh->b_count)) {
717 if (buffer_dirty(bh))
718 printk(KERN_WARNING
719 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
720 kdevname(dev), bh->b_blocknr, bh->b_size);
721 remove_inode_queue(bh);
722 __remove_from_queues(bh);
723 put_last_free(bh);
724 } else {
725 if (atomic_set_buffer_clean(bh))
726 __refile_buffer(bh);
727 clear_bit(BH_Uptodate, &bh->b_state);
728 printk(KERN_WARNING
729 "set_blocksize: "
730 "b_count %d, dev %s, block %lu, from %p\n",
731 atomic_read(&bh->b_count), bdevname(bh->b_dev),
732 bh->b_blocknr, __builtin_return_address(0));
734 write_unlock(&hash_table_lock);
735 if (slept)
736 goto out;
739 out:
740 spin_unlock(&lru_list_lock);
741 if (slept)
742 goto retry;
746 * We used to try various strange things. Let's not.
748 static void refill_freelist(int size)
750 if (!grow_buffers(size))
751 wakeup_bdflush(1); /* Sets task->state to TASK_RUNNING */
754 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
756 bh->b_list = BUF_CLEAN;
757 bh->b_end_io = handler;
758 bh->b_private = private;
761 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
763 mark_buffer_uptodate(bh, uptodate);
764 unlock_buffer(bh);
767 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
769 mark_buffer_uptodate(bh, uptodate);
770 unlock_buffer(bh);
771 BUG();
774 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
776 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
777 unsigned long flags;
778 struct buffer_head *tmp;
779 struct page *page;
781 mark_buffer_uptodate(bh, uptodate);
783 /* This is a temporary buffer used for page I/O. */
784 page = bh->b_page;
786 if (!uptodate)
787 SetPageError(page);
790 * Be _very_ careful from here on. Bad things can happen if
791 * two buffer heads end IO at almost the same time and both
792 * decide that the page is now completely done.
794 * Async buffer_heads are here only as labels for IO, and get
795 * thrown away once the IO for this page is complete. IO is
796 * deemed complete once all buffers have been visited
797 * (b_count==0) and are now unlocked. We must make sure that
798 * only the _last_ buffer that decrements its count is the one
799 * that unlocks the page..
801 spin_lock_irqsave(&page_uptodate_lock, flags);
802 unlock_buffer(bh);
803 atomic_dec(&bh->b_count);
804 tmp = bh->b_this_page;
805 while (tmp != bh) {
806 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
807 goto still_busy;
808 tmp = tmp->b_this_page;
811 /* OK, the async IO on this page is complete. */
812 spin_unlock_irqrestore(&page_uptodate_lock, flags);
815 * if none of the buffers had errors then we can set the
816 * page uptodate:
818 if (!PageError(page))
819 SetPageUptodate(page);
822 * Run the hooks that have to be done when a page I/O has completed.
824 if (PageTestandClearDecrAfter(page))
825 atomic_dec(&nr_async_pages);
827 UnlockPage(page);
829 return;
831 still_busy:
832 spin_unlock_irqrestore(&page_uptodate_lock, flags);
833 return;
837 * Synchronise all the inode's dirty buffers to the disk.
839 * We have conflicting pressures: we want to make sure that all
840 * initially dirty buffers get waited on, but that any subsequently
841 * dirtied buffers don't. After all, we don't want fsync to last
842 * forever if somebody is actively writing to the file.
844 * Do this in two main stages: first we copy dirty buffers to a
845 * temporary inode list, queueing the writes as we go. Then we clean
846 * up, waiting for those writes to complete.
848 * During this second stage, any subsequent updates to the file may end
849 * up refiling the buffer on the original inode's dirty list again, so
850 * there is a chance we will end up with a buffer queued for write but
851 * not yet completed on that list. So, as a final cleanup we go through
852 * the osync code to catch these locked, dirty buffers without requeuing
853 * any newly dirty buffers for write.
856 int fsync_inode_buffers(struct inode *inode)
858 struct buffer_head *bh;
859 struct inode tmp;
860 int err = 0, err2;
862 INIT_LIST_HEAD(&tmp.i_dirty_buffers);
864 spin_lock(&lru_list_lock);
866 while (!list_empty(&inode->i_dirty_buffers)) {
867 bh = BH_ENTRY(inode->i_dirty_buffers.next);
868 list_del(&bh->b_inode_buffers);
869 if (!buffer_dirty(bh) && !buffer_locked(bh))
870 bh->b_inode = NULL;
871 else {
872 bh->b_inode = &tmp;
873 list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
874 atomic_inc(&bh->b_count);
875 if (buffer_dirty(bh)) {
876 spin_unlock(&lru_list_lock);
877 ll_rw_block(WRITE, 1, &bh);
878 spin_lock(&lru_list_lock);
883 while (!list_empty(&tmp.i_dirty_buffers)) {
884 bh = BH_ENTRY(tmp.i_dirty_buffers.prev);
885 remove_inode_queue(bh);
886 spin_unlock(&lru_list_lock);
887 wait_on_buffer(bh);
888 if (!buffer_uptodate(bh))
889 err = -EIO;
890 brelse(bh);
891 spin_lock(&lru_list_lock);
894 spin_unlock(&lru_list_lock);
895 err2 = osync_inode_buffers(inode);
897 if (err)
898 return err;
899 else
900 return err2;
905 * osync is designed to support O_SYNC io. It waits synchronously for
906 * all already-submitted IO to complete, but does not queue any new
907 * writes to the disk.
909 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
910 * you dirty the buffers, and then use osync_inode_buffers to wait for
911 * completion. Any other dirty buffers which are not yet queued for
912 * write will not be flushed to disk by the osync.
915 int osync_inode_buffers(struct inode *inode)
917 struct buffer_head *bh;
918 struct list_head *list;
919 int err = 0;
921 spin_lock(&lru_list_lock);
923 repeat:
925 for (list = inode->i_dirty_buffers.prev;
926 bh = BH_ENTRY(list), list != &inode->i_dirty_buffers;
927 list = bh->b_inode_buffers.prev) {
928 if (buffer_locked(bh)) {
929 atomic_inc(&bh->b_count);
930 spin_unlock(&lru_list_lock);
931 wait_on_buffer(bh);
932 brelse(bh);
933 if (!buffer_uptodate(bh))
934 err = -EIO;
935 spin_lock(&lru_list_lock);
936 goto repeat;
940 spin_unlock(&lru_list_lock);
941 return err;
946 * Invalidate any and all dirty buffers on a given inode. We are
947 * probably unmounting the fs, but that doesn't mean we have already
948 * done a sync(). Just drop the buffers from the inode list.
951 void invalidate_inode_buffers(struct inode *inode)
953 struct list_head *list, *next;
955 spin_lock(&lru_list_lock);
956 list = inode->i_dirty_buffers.next;
957 while (list != &inode->i_dirty_buffers) {
958 next = list->next;
959 remove_inode_queue(BH_ENTRY(list));
960 list = next;
962 spin_unlock(&lru_list_lock);
967 * Ok, this is getblk, and it isn't very clear, again to hinder
968 * race-conditions. Most of the code is seldom used, (ie repeating),
969 * so it should be much more efficient than it looks.
971 * The algorithm is changed: hopefully better, and an elusive bug removed.
973 * 14.02.92: changed it to sync dirty buffers a bit: better performance
974 * when the filesystem starts to get full of dirty blocks (I hope).
976 struct buffer_head * getblk(kdev_t dev, int block, int size)
978 struct buffer_head * bh;
979 int isize;
981 repeat:
982 spin_lock(&lru_list_lock);
983 write_lock(&hash_table_lock);
984 bh = __get_hash_table(dev, block, size);
985 if (bh)
986 goto out;
988 isize = BUFSIZE_INDEX(size);
989 spin_lock(&free_list[isize].lock);
990 bh = free_list[isize].list;
991 if (bh) {
992 __remove_from_free_list(bh, isize);
993 atomic_set(&bh->b_count, 1);
995 spin_unlock(&free_list[isize].lock);
998 * OK, FINALLY we know that this buffer is the only one of
999 * its kind, we hold a reference (b_count>0), it is unlocked,
1000 * and it is clean.
1002 if (bh) {
1003 init_buffer(bh, end_buffer_io_sync, NULL);
1004 bh->b_dev = dev;
1005 bh->b_blocknr = block;
1006 bh->b_state = 1 << BH_Mapped;
1008 /* Insert the buffer into the regular lists */
1009 __insert_into_queues(bh);
1010 out:
1011 write_unlock(&hash_table_lock);
1012 spin_unlock(&lru_list_lock);
1013 touch_buffer(bh);
1014 return bh;
1018 * If we block while refilling the free list, somebody may
1019 * create the buffer first ... search the hashes again.
1021 write_unlock(&hash_table_lock);
1022 spin_unlock(&lru_list_lock);
1023 refill_freelist(size);
1024 goto repeat;
1027 /* -1 -> no need to flush
1028 0 -> async flush
1029 1 -> sync flush (wait for I/O completion) */
1030 int balance_dirty_state(kdev_t dev)
1032 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1033 int shortage;
1035 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
1036 tot = nr_free_buffer_pages();
1038 dirty *= 200;
1039 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1040 hard_dirty_limit = soft_dirty_limit * 2;
1042 /* First, check for the "real" dirty limit. */
1043 if (dirty > soft_dirty_limit) {
1044 if (dirty > hard_dirty_limit)
1045 return 1;
1046 return 0;
1050 * If we are about to get low on free pages and
1051 * cleaning the inactive_dirty pages would help
1052 * fix this, wake up bdflush.
1054 shortage = free_shortage();
1055 if (shortage && nr_inactive_dirty_pages > shortage &&
1056 nr_inactive_dirty_pages > freepages.high)
1057 return 0;
1059 return -1;
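/* Worked example with the default nfract of 40: since "dirty" is scaled
 * by 200 above, the soft limit is crossed once dirty buffer pages exceed
 * 40/200 = 20% of nr_free_buffer_pages(), and the hard limit (sync
 * flush) at twice that, i.e. 40%. */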
1063 * if a new dirty buffer is created we need to balance bdflush.
1065 * in the future we might want to make bdflush aware of different
1066 * pressures on different devices - thus the (currently unused)
1067 * 'dev' parameter.
1069 void balance_dirty(kdev_t dev)
1071 int state = balance_dirty_state(dev);
1073 if (state < 0)
1074 return;
1075 wakeup_bdflush(state);
1078 static __inline__ void __mark_dirty(struct buffer_head *bh)
1080 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1081 refile_buffer(bh);
1084 /* atomic version, the user must call balance_dirty() by hand
1085 as soon as it becomes possible to block */
1086 void __mark_buffer_dirty(struct buffer_head *bh)
1088 if (!atomic_set_buffer_dirty(bh))
1089 __mark_dirty(bh);
1092 void mark_buffer_dirty(struct buffer_head *bh)
1094 __mark_buffer_dirty(bh);
1095 balance_dirty(bh->b_dev);
1099 * A buffer may need to be moved from one buffer list to another
1100 * (e.g. in case it is not shared any more). Handle this.
1102 static void __refile_buffer(struct buffer_head *bh)
1104 int dispose = BUF_CLEAN;
1105 if (buffer_locked(bh))
1106 dispose = BUF_LOCKED;
1107 if (buffer_dirty(bh))
1108 dispose = BUF_DIRTY;
1109 if (buffer_protected(bh))
1110 dispose = BUF_PROTECTED;
1111 if (dispose != bh->b_list) {
1112 __remove_from_lru_list(bh, bh->b_list);
1113 bh->b_list = dispose;
1114 if (dispose == BUF_CLEAN)
1115 remove_inode_queue(bh);
1116 __insert_into_lru_list(bh, dispose);
1120 void refile_buffer(struct buffer_head *bh)
1122 spin_lock(&lru_list_lock);
1123 __refile_buffer(bh);
1124 spin_unlock(&lru_list_lock);
1128 * Release a buffer head
1130 void __brelse(struct buffer_head * buf)
1132 if (atomic_read(&buf->b_count)) {
1133 atomic_dec(&buf->b_count);
1134 return;
1136 printk("VFS: brelse: Trying to free free buffer\n");
1140 * bforget() is like brelse(), except it puts the buffer on the
1141 * free list if it can.. We can NOT free the buffer if:
1142 * - there are other users of it
1143 * - it is locked and thus can have active IO
1145 void __bforget(struct buffer_head * buf)
1147 /* grab the lru lock here to block bdflush. */
1148 spin_lock(&lru_list_lock);
1149 write_lock(&hash_table_lock);
1150 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
1151 goto in_use;
1152 __hash_unlink(buf);
1153 remove_inode_queue(buf);
1154 write_unlock(&hash_table_lock);
1155 __remove_from_lru_list(buf, buf->b_list);
1156 spin_unlock(&lru_list_lock);
1157 put_last_free(buf);
1158 return;
1160 in_use:
1161 write_unlock(&hash_table_lock);
1162 spin_unlock(&lru_list_lock);
1166 * bread() reads a specified block and returns the buffer that contains
1167 * it. It returns NULL if the block was unreadable.
1169 struct buffer_head * bread(kdev_t dev, int block, int size)
1171 struct buffer_head * bh;
1173 bh = getblk(dev, block, size);
1174 if (buffer_uptodate(bh))
1175 return bh;
1176 ll_rw_block(READ, 1, &bh);
1177 wait_on_buffer(bh);
1178 if (buffer_uptodate(bh))
1179 return bh;
1180 brelse(bh);
1181 return NULL;
1185 * Ok, breada can be used as bread, but additionally it schedules
1186 * read-ahead of the blocks that follow (at most NBUF buffers in total),
1187 * bounded by the given file size.
1190 #define NBUF 16
1192 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1193 unsigned int pos, unsigned int filesize)
1195 struct buffer_head * bhlist[NBUF];
1196 unsigned int blocks;
1197 struct buffer_head * bh;
1198 int index;
1199 int i, j;
1201 if (pos >= filesize)
1202 return NULL;
1204 if (block < 0)
1205 return NULL;
1207 bh = getblk(dev, block, bufsize);
1208 index = BUFSIZE_INDEX(bh->b_size);
1210 if (buffer_uptodate(bh))
1211 return(bh);
1212 else ll_rw_block(READ, 1, &bh);
1214 blocks = (filesize - pos) >> (9+index);
1216 if (blocks > NBUF)
1217 blocks = NBUF;
1219 bhlist[0] = bh;
1220 j = 1;
1221 for(i=1; i<blocks; i++) {
1222 bh = getblk(dev,block+i,bufsize);
1223 if (buffer_uptodate(bh)) {
1224 brelse(bh);
1225 break;
1227 else bhlist[j++] = bh;
1230 /* Request the read for these buffers, and then release them. */
1231 if (j>1)
1232 ll_rw_block(READA, (j-1), bhlist+1);
1233 for(i=1; i<j; i++)
1234 brelse(bhlist[i]);
1236 /* Wait for this buffer, and then continue on. */
1237 bh = bhlist[0];
1238 wait_on_buffer(bh);
1239 if (buffer_uptodate(bh))
1240 return bh;
1241 brelse(bh);
1242 return NULL;
1246 * Note: the caller should wake up the buffer_wait list if needed.
1248 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1250 if (bh->b_inode)
1251 BUG();
1252 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1253 kmem_cache_free(bh_cachep, bh);
1254 } else {
1255 bh->b_blocknr = -1;
1256 init_waitqueue_head(&bh->b_wait);
1257 nr_unused_buffer_heads++;
1258 bh->b_next_free = unused_list;
1259 bh->b_this_page = NULL;
1260 unused_list = bh;
1265 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1266 * no-buffer-head deadlock. Return NULL on failure; waiting for
1267 * buffer heads is now handled in create_buffers().
1269 static struct buffer_head * get_unused_buffer_head(int async)
1271 struct buffer_head * bh;
1273 spin_lock(&unused_list_lock);
1274 if (nr_unused_buffer_heads > NR_RESERVED) {
1275 bh = unused_list;
1276 unused_list = bh->b_next_free;
1277 nr_unused_buffer_heads--;
1278 spin_unlock(&unused_list_lock);
1279 return bh;
1281 spin_unlock(&unused_list_lock);
1283 /* This is critical. We can't swap out pages to get
1284 * more buffer heads, because the swap-out may need
1285 * more buffer-heads itself. Thus SLAB_BUFFER.
1287 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1288 memset(bh, 0, sizeof(*bh));
1289 init_waitqueue_head(&bh->b_wait);
1290 return bh;
1294 * If we need an async buffer, use the reserved buffer heads.
1296 if (async) {
1297 spin_lock(&unused_list_lock);
1298 if (unused_list) {
1299 bh = unused_list;
1300 unused_list = bh->b_next_free;
1301 nr_unused_buffer_heads--;
1302 spin_unlock(&unused_list_lock);
1303 return bh;
1305 spin_unlock(&unused_list_lock);
1307 #if 0
1309 * (Pending further analysis ...)
1310 * Ordinary (non-async) requests can use a different memory priority
1311 * to free up pages. Any swapping thus generated will use async
1312 * buffer heads.
1314 if(!async &&
1315 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1316 memset(bh, 0, sizeof(*bh));
1317 init_waitqueue_head(&bh->b_wait);
1318 return bh;
1320 #endif
1322 return NULL;
1325 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1327 bh->b_page = page;
1328 if (offset >= PAGE_SIZE)
1329 BUG();
1330 if (PageHighMem(page))
1332 * This catches illegal uses and preserves the offset:
1334 bh->b_data = (char *)(0 + offset);
1335 else
1336 bh->b_data = page_address(page) + offset;
1340 * Create the appropriate buffers when given a page for data area and
1341 * the size of each buffer.. Use the bh->b_this_page linked list to
1342 * follow the buffers created. Return NULL if unable to create more
1343 * buffers.
1344 * The async flag is used to differentiate async IO (paging, swapping)
1345 * from ordinary buffer allocations, and only async requests are allowed
1346 * to sleep waiting for buffer heads.
1348 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1350 struct buffer_head *bh, *head;
1351 long offset;
1353 try_again:
1354 head = NULL;
1355 offset = PAGE_SIZE;
1356 while ((offset -= size) >= 0) {
1357 bh = get_unused_buffer_head(async);
1358 if (!bh)
1359 goto no_grow;
1361 bh->b_dev = B_FREE; /* Flag as unused */
1362 bh->b_this_page = head;
1363 head = bh;
1365 bh->b_state = 0;
1366 bh->b_next_free = NULL;
1367 bh->b_pprev = NULL;
1368 atomic_set(&bh->b_count, 0);
1369 bh->b_size = size;
1371 set_bh_page(bh, page, offset);
1373 bh->b_list = BUF_CLEAN;
1374 bh->b_end_io = end_buffer_io_bad;
1376 return head;
1378 * In case anything failed, we just free everything we got.
1380 no_grow:
1381 if (head) {
1382 spin_lock(&unused_list_lock);
1383 do {
1384 bh = head;
1385 head = head->b_this_page;
1386 __put_unused_buffer_head(bh);
1387 } while (head);
1388 spin_unlock(&unused_list_lock);
1390 /* Wake up any waiters ... */
1391 wake_up(&buffer_wait);
1395 * Return failure for non-async IO requests. Async IO requests
1396 * are not allowed to fail, so we have to wait until buffer heads
1397 * become available. But we don't want tasks sleeping with
1398 * partially complete buffers, so all were released above.
1400 if (!async)
1401 return NULL;
1403 /* We're _really_ low on memory. Now we just
1404 * wait for old buffer heads to become free due to
1405 * finishing IO. Since this is an async request and
1406 * the reserve list is empty, we're sure there are
1407 * async buffer heads in use.
1409 run_task_queue(&tq_disk);
1412 * Set our state for sleeping, then check again for buffer heads.
1413 * This ensures we won't miss a wake_up from an interrupt.
1415 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1416 goto try_again;
1419 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1421 struct buffer_head *head, *bh, *tail;
1422 int block;
1424 if (!PageLocked(page))
1425 BUG();
1427 * Allocate async buffer heads pointing to this page, just for I/O.
1428 * They don't show up in the buffer hash table, but they *are*
1429 * registered in page->buffers.
1431 head = create_buffers(page, size, 1);
1432 if (page->buffers)
1433 BUG();
1434 if (!head)
1435 BUG();
1436 tail = head;
1437 for (bh = head; bh; bh = bh->b_this_page) {
1438 block = *(b++);
1440 tail = bh;
1441 init_buffer(bh, end_buffer_io_async, NULL);
1442 bh->b_dev = dev;
1443 bh->b_blocknr = block;
1445 set_bit(BH_Mapped, &bh->b_state);
1447 tail->b_this_page = head;
1448 page_cache_get(page);
1449 page->buffers = head;
1450 return 0;
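/*
 * unmap_buffer() detaches a buffer from its on-disk block: the dirty bit
 * is cleared, any I/O in flight is waited for, and the Uptodate, Mapped,
 * Req and New bits are dropped, so the block being truncated away can be
 * freed safely.
 */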
1453 static void unmap_buffer(struct buffer_head * bh)
1455 if (buffer_mapped(bh)) {
1456 mark_buffer_clean(bh);
1457 wait_on_buffer(bh);
1458 clear_bit(BH_Uptodate, &bh->b_state);
1459 clear_bit(BH_Mapped, &bh->b_state);
1460 clear_bit(BH_Req, &bh->b_state);
1461 clear_bit(BH_New, &bh->b_state);
1466 * We don't have to release all buffers here, but
1467 * we have to be sure that no dirty buffer is left
1468 * and no IO is going on (no buffer is locked), because
1469 * we have truncated the file and are going to free the
1470 * blocks on-disk..
1472 int block_flushpage(struct page *page, unsigned long offset)
1474 struct buffer_head *head, *bh, *next;
1475 unsigned int curr_off = 0;
1477 if (!PageLocked(page))
1478 BUG();
1479 if (!page->buffers)
1480 return 1;
1482 head = page->buffers;
1483 bh = head;
1484 do {
1485 unsigned int next_off = curr_off + bh->b_size;
1486 next = bh->b_this_page;
1489 * is this block fully flushed?
1491 if (offset <= curr_off)
1492 unmap_buffer(bh);
1493 curr_off = next_off;
1494 bh = next;
1495 } while (bh != head);
1498 * subtle. We release buffer-heads only if this is
1499 * the 'final' flushpage. We have invalidated the get_block
1500 * cached value unconditionally, so real IO is not
1501 * possible anymore.
1503 * If the free doesn't work out, the buffers can be
1504 * left around - they just turn into anonymous buffers
1505 * instead.
1507 if (!offset) {
1508 if (!try_to_free_buffers(page, 0)) {
1509 atomic_inc(&buffermem_pages);
1510 return 0;
1514 return 1;
1517 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1519 struct buffer_head *bh, *head, *tail;
1521 head = create_buffers(page, blocksize, 1);
1522 if (page->buffers)
1523 BUG();
1525 bh = head;
1526 do {
1527 bh->b_dev = inode->i_dev;
1528 bh->b_blocknr = 0;
1529 bh->b_end_io = end_buffer_io_bad;
1530 tail = bh;
1531 bh = bh->b_this_page;
1532 } while (bh);
1533 tail->b_this_page = head;
1534 page->buffers = head;
1535 page_cache_get(page);
1539 * We are taking a block for data and we don't want any output from any
1540 * buffer-cache aliases starting from the return of this function and
1541 * until the moment when something explicitly marks the buffer
1542 * dirty (hopefully that will not happen until we free that block ;-)
1543 * We don't even need to mark it not-uptodate - nobody can expect
1544 * anything from a newly allocated buffer anyway. We used to use
1545 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1546 * don't want to mark the alias unmapped, for example - it would confuse
1547 * anyone who might pick it up with bread() afterwards...
1550 static void unmap_underlying_metadata(struct buffer_head * bh)
1552 struct buffer_head *old_bh;
1554 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1555 if (old_bh) {
1556 mark_buffer_clean(old_bh);
1557 wait_on_buffer(old_bh);
1558 clear_bit(BH_Req, &old_bh->b_state);
1559 /* Here we could run brelse or bforget. We use
1560 bforget because it will try to put the buffer
1561 in the freelist. */
1562 __bforget(old_bh);
1567 * NOTE! All mapped/uptodate combinations are valid:
1569 * Mapped Uptodate Meaning
1571 * No No "unknown" - must do get_block()
1572 * No Yes "hole" - zero-filled
1573 * Yes No "allocated" - allocated on disk, not read in
1574 * Yes Yes "valid" - allocated and up-to-date in memory.
1576 * "Dirty" is valid only with the last case (mapped+uptodate).
1580 * block_write_full_page() is SMP-safe - currently it's still
1581 * being called with the kernel lock held, but the code is ready.
1583 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1585 int err, i, need_balance_dirty = 0;
1586 unsigned long block;
1587 struct buffer_head *bh, *head;
1589 if (!PageLocked(page))
1590 BUG();
1592 if (!page->buffers)
1593 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1594 head = page->buffers;
1596 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1598 bh = head;
1599 i = 0;
1600 do {
1602 * If the buffer isn't up-to-date, we can't be sure
1603 * that the buffer has been initialized with the proper
1604 * block number information etc..
1606 * Leave it to the low-level FS to make all those
1607 * decisions (block #0 may actually be a valid block)
1609 bh->b_end_io = end_buffer_io_sync;
1610 if (!buffer_mapped(bh)) {
1611 err = get_block(inode, block, bh, 1);
1612 if (err)
1613 goto out;
1614 if (buffer_new(bh))
1615 unmap_underlying_metadata(bh);
1617 set_bit(BH_Uptodate, &bh->b_state);
1618 if (!atomic_set_buffer_dirty(bh)) {
1619 buffer_insert_inode_queue(bh, inode);
1620 __mark_dirty(bh);
1621 need_balance_dirty = 1;
1624 bh = bh->b_this_page;
1625 block++;
1626 } while (bh != head);
1628 if (need_balance_dirty)
1629 balance_dirty(bh->b_dev);
1631 SetPageUptodate(page);
1632 return 0;
1633 out:
1634 ClearPageUptodate(page);
1635 return err;
1638 static int __block_prepare_write(struct inode *inode, struct page *page,
1639 unsigned from, unsigned to, get_block_t *get_block)
1641 unsigned block_start, block_end;
1642 unsigned long block;
1643 int err = 0;
1644 unsigned blocksize, bbits;
1645 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1646 char *kaddr = kmap(page);
1648 blocksize = inode->i_sb->s_blocksize;
1649 if (!page->buffers)
1650 create_empty_buffers(page, inode, blocksize);
1651 head = page->buffers;
1653 bbits = inode->i_sb->s_blocksize_bits;
1654 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1656 for(bh = head, block_start = 0; bh != head || !block_start;
1657 block++, block_start=block_end, bh = bh->b_this_page) {
1658 if (!bh)
1659 BUG();
1660 block_end = block_start+blocksize;
1661 if (block_end <= from)
1662 continue;
1663 if (block_start >= to)
1664 break;
1665 bh->b_end_io = end_buffer_io_sync;
1666 if (!buffer_mapped(bh)) {
1667 err = get_block(inode, block, bh, 1);
1668 if (err)
1669 goto out;
1670 if (buffer_new(bh)) {
1671 unmap_underlying_metadata(bh);
1672 if (Page_Uptodate(page)) {
1673 set_bit(BH_Uptodate, &bh->b_state);
1674 continue;
1676 if (block_end > to)
1677 memset(kaddr+to, 0, block_end-to);
1678 if (block_start < from)
1679 memset(kaddr+block_start, 0, from-block_start);
1680 if (block_end > to || block_start < from)
1681 flush_dcache_page(page);
1682 continue;
1685 if (Page_Uptodate(page)) {
1686 set_bit(BH_Uptodate, &bh->b_state);
1687 continue;
1689 if (!buffer_uptodate(bh) &&
1690 (block_start < from || block_end > to)) {
1691 ll_rw_block(READ, 1, &bh);
1692 *wait_bh++=bh;
1696 * If we issued read requests - let them complete.
1698 while(wait_bh > wait) {
1699 wait_on_buffer(*--wait_bh);
1700 err = -EIO;
1701 if (!buffer_uptodate(*wait_bh))
1702 goto out;
1704 return 0;
1705 out:
1706 return err;
1709 static int __block_commit_write(struct inode *inode, struct page *page,
1710 unsigned from, unsigned to)
1712 unsigned block_start, block_end;
1713 int partial = 0, need_balance_dirty = 0;
1714 unsigned blocksize;
1715 struct buffer_head *bh, *head;
1717 blocksize = inode->i_sb->s_blocksize;
1719 for(bh = head = page->buffers, block_start = 0;
1720 bh != head || !block_start;
1721 block_start=block_end, bh = bh->b_this_page) {
1722 block_end = block_start + blocksize;
1723 if (block_end <= from || block_start >= to) {
1724 if (!buffer_uptodate(bh))
1725 partial = 1;
1726 } else {
1727 set_bit(BH_Uptodate, &bh->b_state);
1728 if (!atomic_set_buffer_dirty(bh)) {
1729 __mark_dirty(bh);
1730 buffer_insert_inode_queue(bh, inode);
1731 need_balance_dirty = 1;
1736 if (need_balance_dirty)
1737 balance_dirty(bh->b_dev);
1739 * If this is a partial write that happened to make all buffers
1740 * uptodate then we can optimize away a bogus readpage() for
1741 * the next read(). Here we 'discover' whether the page went
1742 * uptodate as a result of this (potentially partial) write.
1744 if (!partial)
1745 SetPageUptodate(page);
1746 return 0;
1750 * Generic "read page" function for block devices that have the normal
1751 * get_block functionality. This is most of the block device filesystems.
1752 * Reads the page asynchronously --- the unlock_buffer() and
1753 * mark_buffer_uptodate() functions propagate buffer state into the
1754 * page struct once IO has completed.
1756 int block_read_full_page(struct page *page, get_block_t *get_block)
1758 struct inode *inode = (struct inode*)page->mapping->host;
1759 unsigned long iblock, lblock;
1760 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1761 unsigned int blocksize, blocks;
1762 char *kaddr = NULL;
1763 int nr, i;
1765 if (!PageLocked(page))
1766 PAGE_BUG(page);
1767 blocksize = inode->i_sb->s_blocksize;
1768 if (!page->buffers)
1769 create_empty_buffers(page, inode, blocksize);
1770 head = page->buffers;
1772 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1773 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1774 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1775 bh = head;
1776 nr = 0;
1777 i = 0;
1779 do {
1780 if (buffer_uptodate(bh))
1781 continue;
1783 if (!buffer_mapped(bh)) {
1784 if (iblock < lblock) {
1785 if (get_block(inode, iblock, bh, 0))
1786 continue;
1788 if (!buffer_mapped(bh)) {
1789 if (!kaddr)
1790 kaddr = kmap(page);
1791 memset(kaddr + i*blocksize, 0, blocksize);
1792 flush_dcache_page(page);
1793 set_bit(BH_Uptodate, &bh->b_state);
1794 continue;
1798 init_buffer(bh, end_buffer_io_async, NULL);
1799 atomic_inc(&bh->b_count);
1800 arr[nr] = bh;
1801 nr++;
1802 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1804 if (nr) {
1805 if (Page_Uptodate(page))
1806 BUG();
1807 ll_rw_block(READ, nr, arr);
1808 } else {
1810 * all buffers are uptodate - we can set the page
1811 * uptodate as well.
1813 SetPageUptodate(page);
1814 UnlockPage(page);
1816 if (kaddr)
1817 kunmap(page);
1818 return 0;
1822 * For moronic filesystems that do not allow holes in files.
1823 * We may have to extend the file.
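/*
 * In other words: before allowing a write at [offset, to) in this page,
 * every page between the current end of data (*bytes) and this one is
 * grabbed and zero-filled, and the gap between *bytes and "offset" on
 * this page is zeroed too, so the file never ends up with a hole.
 */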
1826 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1828 struct address_space *mapping = page->mapping;
1829 struct inode *inode = (struct inode*)mapping->host;
1830 struct page *new_page;
1831 unsigned long pgpos;
1832 long status;
1833 unsigned zerofrom;
1834 unsigned blocksize = inode->i_sb->s_blocksize;
1835 char *kaddr;
1837 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1838 status = -ENOMEM;
1839 new_page = grab_cache_page(mapping, pgpos);
1840 if (!new_page)
1841 goto out;
1842 /* we might sleep */
1843 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1844 UnlockPage(new_page);
1845 page_cache_release(new_page);
1846 continue;
1848 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1849 if (zerofrom & (blocksize-1)) {
1850 *bytes |= (blocksize-1);
1851 (*bytes)++;
1853 status = __block_prepare_write(inode, new_page, zerofrom,
1854 PAGE_CACHE_SIZE, get_block);
1855 if (status)
1856 goto out_unmap;
1857 kaddr = page_address(new_page);
1858 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1859 flush_dcache_page(new_page);
1860 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1861 kunmap(new_page);
1862 UnlockPage(new_page);
1863 page_cache_release(new_page);
1866 if (page->index < pgpos) {
1867 /* completely inside the area */
1868 zerofrom = offset;
1869 } else {
1870 /* page covers the boundary, find the boundary offset */
1871 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1873 /* if we are going to expand the file, the last block will be filled */
1874 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1875 *bytes |= (blocksize-1);
1876 (*bytes)++;
1879 /* starting below the boundary? Nothing to zero out */
1880 if (offset <= zerofrom)
1881 zerofrom = offset;
1883 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1884 if (status)
1885 goto out1;
1886 kaddr = page_address(page);
1887 if (zerofrom < offset) {
1888 memset(kaddr+zerofrom, 0, offset-zerofrom);
1889 flush_dcache_page(page);
1890 __block_commit_write(inode, page, zerofrom, offset);
1892 return 0;
1893 out1:
1894 ClearPageUptodate(page);
1895 kunmap(page);
1896 return status;
1898 out_unmap:
1899 ClearPageUptodate(new_page);
1900 kunmap(new_page);
1901 UnlockPage(new_page);
1902 page_cache_release(new_page);
1903 out:
1904 return status;
1907 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1908 get_block_t *get_block)
1910 struct inode *inode = (struct inode*)page->mapping->host;
1911 int err = __block_prepare_write(inode, page, from, to, get_block);
1912 if (err) {
1913 ClearPageUptodate(page);
1914 kunmap(page);
1916 return err;
1919 int generic_commit_write(struct file *file, struct page *page,
1920 unsigned from, unsigned to)
1922 struct inode *inode = (struct inode*)page->mapping->host;
1923 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1924 __block_commit_write(inode,page,from,to);
1925 kunmap(page);
1926 if (pos > inode->i_size) {
1927 inode->i_size = pos;
1928 mark_inode_dirty(inode);
1930 return 0;
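/*
 * block_truncate_page() zeroes out the remainder of the block that
 * contains byte offset "from" (reading the block in first if needed and
 * marking it dirty), so that truncating a file does not leave stale data
 * visible in the final partial block.
 */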
1933 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
1935 unsigned long index = from >> PAGE_CACHE_SHIFT;
1936 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1937 unsigned blocksize, iblock, length, pos;
1938 struct inode *inode = (struct inode *)mapping->host;
1939 struct page *page;
1940 struct buffer_head *bh;
1941 int err;
1943 blocksize = inode->i_sb->s_blocksize;
1944 length = offset & (blocksize - 1);
1946 /* Block boundary? Nothing to do */
1947 if (!length)
1948 return 0;
1950 length = blocksize - length;
1951 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1953 page = grab_cache_page(mapping, index);
1954 err = PTR_ERR(page);
1955 if (IS_ERR(page))
1956 goto out;
1958 if (!page->buffers)
1959 create_empty_buffers(page, inode, blocksize);
1961 /* Find the buffer that contains "offset" */
1962 bh = page->buffers;
1963 pos = blocksize;
1964 while (offset >= pos) {
1965 bh = bh->b_this_page;
1966 iblock++;
1967 pos += blocksize;
1970 err = 0;
1971 if (!buffer_mapped(bh)) {
1972 /* Hole? Nothing to do */
1973 if (buffer_uptodate(bh))
1974 goto unlock;
1975 get_block(inode, iblock, bh, 0);
1976 /* Still unmapped? Nothing to do */
1977 if (!buffer_mapped(bh))
1978 goto unlock;
1981 /* Ok, it's mapped. Make sure it's up-to-date */
1982 if (Page_Uptodate(page))
1983 set_bit(BH_Uptodate, &bh->b_state);
1985 bh->b_end_io = end_buffer_io_sync;
1986 if (!buffer_uptodate(bh)) {
1987 err = -EIO;
1988 ll_rw_block(READ, 1, &bh);
1989 wait_on_buffer(bh);
1990 /* Uhhuh. Read error. Complain and punt. */
1991 if (!buffer_uptodate(bh))
1992 goto unlock;
1995 memset(kmap(page) + offset, 0, length);
1996 flush_dcache_page(page);
1997 kunmap(page);
1999 mark_buffer_dirty(bh);
2000 err = 0;
2002 unlock:
2003 UnlockPage(page);
2004 page_cache_release(page);
2005 out:
2006 return err;
2009 int block_write_full_page(struct page *page, get_block_t *get_block)
2011 struct inode *inode = (struct inode*)page->mapping->host;
2012 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2013 unsigned offset;
2014 int err;
2016 /* easy case */
2017 if (page->index < end_index)
2018 return __block_write_full_page(inode, page, get_block);
2020 /* things got complicated... */
2021 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2022 /* OK, are we completely out? */
2023 if (page->index >= end_index+1 || !offset)
2024 return -EIO;
2025 /* Sigh... will have to work, then... */
2026 err = __block_prepare_write(inode, page, 0, offset, get_block);
2027 if (!err) {
2028 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2029 flush_dcache_page(page);
2030 __block_commit_write(inode,page,0,offset);
2031 done:
2032 kunmap(page);
2033 return err;
2035 ClearPageUptodate(page);
2036 goto done;
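/*
 * generic_block_bmap() resolves a logical file block to its on-disk
 * block number by probing the filesystem's get_block() with create == 0
 * on a throwaway buffer_head; an unmapped block (a hole) comes back as 0.
 */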
2039 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2041 struct buffer_head tmp;
2042 struct inode *inode = (struct inode*)mapping->host;
2043 tmp.b_state = 0;
2044 tmp.b_blocknr = 0;
2045 get_block(inode, block, &tmp, 0);
2046 return tmp.b_blocknr;
2050 * IO completion routine for a buffer_head being used for kiobuf IO: we
2051 * can't dispatch the kiobuf callback until io_count reaches 0.
2054 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2056 struct kiobuf *kiobuf;
2058 mark_buffer_uptodate(bh, uptodate);
2060 kiobuf = bh->b_private;
2061 unlock_buffer(bh);
2062 end_kio_request(kiobuf, uptodate);
2067 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2068 * for them to complete. Clean up the buffer_heads afterwards.
2071 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2073 int iosize;
2074 int i;
2075 struct buffer_head *tmp;
2078 iosize = 0;
2079 spin_lock(&unused_list_lock);
2081 for (i = nr; --i >= 0; ) {
2082 iosize += size;
2083 tmp = bh[i];
2084 if (buffer_locked(tmp)) {
2085 spin_unlock(&unused_list_lock);
2086 wait_on_buffer(tmp);
2087 spin_lock(&unused_list_lock);
2090 if (!buffer_uptodate(tmp)) {
2091 /* We are traversing bh'es in reverse order so
2092 clearing iosize on error calculates the
2093 amount of IO before the first error. */
2094 iosize = 0;
2096 __put_unused_buffer_head(tmp);
2099 spin_unlock(&unused_list_lock);
2101 return iosize;
2105 * Start I/O on a physical range of kernel memory, defined by a vector
2106 * of kiobuf structs (much like a user-space iovec list).
2108 * The kiobuf must already be locked for IO. IO is submitted
2109 * asynchronously: you need to check page->locked, page->uptodate, and
2110 * maybe wait on page->wait.
2112 * It is up to the caller to make sure that there are enough blocks
2113 * passed in to completely map the iobufs to disk.
2116 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2117 kdev_t dev, unsigned long b[], int size)
2119 int err;
2120 int length;
2121 int transferred;
2122 int i;
2123 int bufind;
2124 int pageind;
2125 int bhind;
2126 int offset;
2127 int sectors = size>>9;
2128 unsigned long blocknr;
2129 struct kiobuf * iobuf = NULL;
2130 struct page * map;
2131 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
2133 if (!nr)
2134 return 0;
2137 * First, do some alignment and validity checks
2139 for (i = 0; i < nr; i++) {
2140 iobuf = iovec[i];
2141 if ((iobuf->offset & (size-1)) ||
2142 (iobuf->length & (size-1)))
2143 return -EINVAL;
2144 if (!iobuf->nr_pages)
2145 panic("brw_kiovec: iobuf not initialised");
2149 * OK to walk down the iovec doing page IO on each page we find.
2151 bufind = bhind = transferred = err = 0;
2152 for (i = 0; i < nr; i++) {
2153 iobuf = iovec[i];
2154 offset = iobuf->offset;
2155 length = iobuf->length;
2156 iobuf->errno = 0;
2158 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2159 map = iobuf->maplist[pageind];
2160 if (!map) {
2161 err = -EFAULT;
2162 goto error;
2165 while (length > 0) {
2166 blocknr = b[bufind++];
2167 tmp = get_unused_buffer_head(0);
2168 if (!tmp) {
2169 err = -ENOMEM;
2170 goto error;
2173 tmp->b_dev = B_FREE;
2174 tmp->b_size = size;
2175 set_bh_page(tmp, map, offset);
2176 tmp->b_this_page = tmp;
2178 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2179 tmp->b_rdev = tmp->b_dev = dev;
2180 tmp->b_blocknr = blocknr;
2181 tmp->b_rsector = blocknr*sectors;
2182 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2184 if (rw == WRITE) {
2185 set_bit(BH_Uptodate, &tmp->b_state);
2186 set_bit(BH_Dirty, &tmp->b_state);
2189 bh[bhind++] = tmp;
2190 length -= size;
2191 offset += size;
2193 atomic_inc(&iobuf->io_count);
2195 generic_make_request(rw, tmp);
2197 * Wait for IO if we have got too much
2199 if (bhind >= KIO_MAX_SECTORS) {
2200 err = wait_kio(rw, bhind, bh, size);
2201 if (err >= 0)
2202 transferred += err;
2203 else
2204 goto finished;
2205 bhind = 0;
2208 if (offset >= PAGE_SIZE) {
2209 offset = 0;
2210 break;
2212 } /* End of block loop */
2213 } /* End of page loop */
2214 } /* End of iovec loop */
2216 /* Is there any IO still left to submit? */
2217 if (bhind) {
2218 err = wait_kio(rw, bhind, bh, size);
2219 if (err >= 0)
2220 transferred += err;
2221 else
2222 goto finished;
2225 finished:
2226 if (transferred)
2227 return transferred;
2228 return err;
2230 error:
2231 /* We got an error allocating the bh'es. Just free the current
2232 buffer_heads and exit. */
2233 spin_lock(&unused_list_lock);
2234 for (i = bhind; --i >= 0; ) {
2235 __put_unused_buffer_head(bh[i]);
2237 spin_unlock(&unused_list_lock);
2238 goto finished;
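/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical caller of brw_kiovec().  The kiobuf is assumed to have been
 * allocated and mapped already (e.g. via alloc_kiovec()/map_user_kiobuf())
 * and the block numbers in blk[] to have been resolved by the caller;
 * brw_kiovec() returns the number of bytes transferred or a negative errno.
 */
#if 0
static int example_kio_read(struct kiobuf *iobuf, kdev_t dev,
			    unsigned long blk[])
{
	/* one kiobuf, 512-byte blocks */
	return brw_kiovec(READ, 1, &iobuf, dev, blk, 512);
}
#endif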
2242 * Start I/O on a page.
2243 * This function expects the page to be locked and may return
2244 * before I/O is complete. You then have to check page->locked,
2245 * page->uptodate, and maybe wait on page->wait.
2247 * brw_page() is SMP-safe, although it's currently called with the
2248 * kernel lock held - the code itself is ready.
2250 * FIXME: we need a swapper_inode->get_block function to remove
2251 * some of the bmap kludges and interface ugliness here.
2253 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2255 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
2256 int nr, fresh /* temporary debugging flag */, block;
2258 if (!PageLocked(page))
2259 panic("brw_page: page not locked for I/O");
2260 // ClearPageError(page);
2262 * We pretty much rely on the page lock for this, because
2263 * create_page_buffers() might sleep.
2265 fresh = 0;
2266 if (!page->buffers) {
2267 create_page_buffers(rw, page, dev, b, size);
2268 fresh = 1;
2270 if (!page->buffers)
2271 BUG();
2273 head = page->buffers;
2274 bh = head;
2275 nr = 0;
2276 do {
2277 block = *(b++);
2279 if (fresh && (atomic_read(&bh->b_count) != 0))
2280 BUG();
2281 if (rw == READ) {
2282 if (!fresh)
2283 BUG();
2284 if (!buffer_uptodate(bh)) {
2285 arr[nr++] = bh;
2286 atomic_inc(&bh->b_count);
2288 } else { /* WRITE */
2289 if (!bh->b_blocknr) {
2290 if (!block)
2291 BUG();
2292 bh->b_blocknr = block;
2293 } else {
2294 if (!block)
2295 BUG();
2297 set_bit(BH_Uptodate, &bh->b_state);
2298 set_bit(BH_Dirty, &bh->b_state);
2299 arr[nr++] = bh;
2300 atomic_inc(&bh->b_count);
2302 bh = bh->b_this_page;
2303 } while (bh != head);
2304 if ((rw == READ) && nr) {
2305 if (Page_Uptodate(page))
2306 BUG();
2307 ll_rw_block(rw, nr, arr);
2308 } else {
2309 if (!nr && rw == READ) {
2310 SetPageUptodate(page);
2311 UnlockPage(page);
2313 if (nr && (rw == WRITE))
2314 ll_rw_block(rw, nr, arr);
2316 return 0;
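/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical caller reading a single page from "dev".  brw_page() expects
 * the page to be locked and one block number per "size"-byte buffer
 * (PAGE_SIZE/size entries in blocks[]); it may return before I/O completes.
 * The page is assumed to be freshly allocated and owned by the caller.
 */
#if 0
static int example_read_page(struct page *page, kdev_t dev,
			     int blocks[], int size)
{
	LockPage(page);		/* brw_page() wants the page locked */
	return brw_page(READ, page, dev, blocks, size);
}
#endif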
2319 int block_symlink(struct inode *inode, const char *symname, int len)
2321 struct address_space *mapping = inode->i_mapping;
2322 struct page *page = grab_cache_page(mapping, 0);
2323 int err = -ENOMEM;
2324 char *kaddr;
2326 if (!page)
2327 goto fail;
2328 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2329 if (err)
2330 goto fail_map;
2331 kaddr = page_address(page);
2332 memcpy(kaddr, symname, len-1);
2333 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2335 * Notice that we are _not_ going to block here - the end of the page is
2336 * unmapped, so this will only try to map the rest of the page, see
2337 * that it is unmapped (typically it will not even look into the inode -
2338 * ->i_size will be enough for everything) and zero it out.
2339 * OTOH it's obviously correct and should make the page up-to-date.
2341 err = mapping->a_ops->readpage(NULL, page);
2342 wait_on_page(page);
2343 page_cache_release(page);
2344 if (err < 0)
2345 goto fail;
2346 mark_inode_dirty(inode);
2347 return 0;
2348 fail_map:
2349 UnlockPage(page);
2350 page_cache_release(page);
2351 fail:
2352 return err;
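/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical filesystem symlink() method handing the target string to
 * block_symlink() (the trailing NUL is included in the length, as the
 * len-1 arithmetic above expects).  "myfs_symlink", "myfs_new_inode" and
 * "myfs_aops" are invented names; dentry instantiation and error handling
 * around inode creation are omitted.
 */
#if 0
static int myfs_symlink(struct inode *dir, struct dentry *dentry,
			const char *symname)
{
	struct inode *inode = myfs_new_inode(dir);	/* hypothetical helper */

	if (!inode)
		return -ENOSPC;
	inode->i_op = &page_symlink_inode_operations;
	inode->i_mapping->a_ops = &myfs_aops;		/* hypothetical a_ops */
	return block_symlink(inode, symname, strlen(symname) + 1);
}
#endif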
2356 * Try to increase the number of buffers available: the size argument
2357 * is used to determine what kind of buffers we want.
2359 static int grow_buffers(int size)
2361 struct page * page;
2362 struct buffer_head *bh, *tmp;
2363 struct buffer_head * insert_point;
2364 int isize;
2366 if ((size & 511) || (size > PAGE_SIZE)) {
2367 printk("VFS: grow_buffers: size = %d\n",size);
2368 return 0;
2371 page = alloc_page(GFP_BUFFER);
2372 if (!page)
2373 goto out;
2374 LockPage(page);
2375 bh = create_buffers(page, size, 0);
2376 if (!bh)
2377 goto no_buffer_head;
2379 isize = BUFSIZE_INDEX(size);
2381 spin_lock(&free_list[isize].lock);
2382 insert_point = free_list[isize].list;
2383 tmp = bh;
2384 while (1) {
2385 if (insert_point) {
2386 tmp->b_next_free = insert_point->b_next_free;
2387 tmp->b_prev_free = insert_point;
2388 insert_point->b_next_free->b_prev_free = tmp;
2389 insert_point->b_next_free = tmp;
2390 } else {
2391 tmp->b_prev_free = tmp;
2392 tmp->b_next_free = tmp;
2394 insert_point = tmp;
2395 if (tmp->b_this_page)
2396 tmp = tmp->b_this_page;
2397 else
2398 break;
2400 tmp->b_this_page = bh;
2401 free_list[isize].list = bh;
2402 spin_unlock(&free_list[isize].lock);
2404 page->buffers = bh;
2405 page->flags &= ~(1 << PG_referenced);
2406 lru_cache_add(page);
2407 UnlockPage(page);
2408 atomic_inc(&buffermem_pages);
2409 return 1;
2411 no_buffer_head:
2412 UnlockPage(page);
2413 page_cache_release(page);
2414 out:
2415 return 0;
2419 * Sync all the buffers on one page..
2421 * If we have old buffers that are locked, we'll
2422 * wait on them, but we won't wait on the new ones
2423 * we're writing out now.
2425 * This all is required so that we can free up memory
2426 * later.
2428 * Wait:
2429 * 0 - no wait (this does not get called - see try_to_free_buffers below)
2430 * 1 - start IO for dirty buffers
2431 * 2 - wait for completion of locked buffers
2433 static void sync_page_buffers(struct buffer_head *bh, int wait)
2435 struct buffer_head * tmp = bh;
2437 do {
2438 struct buffer_head *p = tmp;
2439 tmp = tmp->b_this_page;
2440 if (buffer_locked(p)) {
2441 if (wait > 1)
2442 __wait_on_buffer(p);
2443 } else if (buffer_dirty(p))
2444 ll_rw_block(WRITE, 1, &p);
2445 } while (tmp != bh);
2449 * Can the buffer be thrown out?
2451 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2452 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2455 * try_to_free_buffers() checks if all the buffers on this particular page
2456 * are unused, and frees the page if so.
2458 * Wake up bdflush() if this fails - if we're running low on memory due
2459 * to dirty buffers, we need to flush them out as quickly as possible.
2461 * NOTE: There are quite a number of ways that threads of control can
2462 * obtain a reference to a buffer head within a page. So we must
2463 * lock out all of these paths to cleanly toss the page.
2465 int try_to_free_buffers(struct page * page, int wait)
2467 struct buffer_head * tmp, * bh = page->buffers;
2468 int index = BUFSIZE_INDEX(bh->b_size);
2469 int loop = 0;
2471 cleaned_buffers_try_again:
2472 spin_lock(&lru_list_lock);
2473 write_lock(&hash_table_lock);
2474 spin_lock(&free_list[index].lock);
2475 tmp = bh;
2476 do {
2477 struct buffer_head *p = tmp;
2479 tmp = tmp->b_this_page;
2480 if (buffer_busy(p))
2481 goto busy_buffer_page;
2482 } while (tmp != bh);
2484 spin_lock(&unused_list_lock);
2485 tmp = bh;
2486 do {
2487 struct buffer_head * p = tmp;
2488 tmp = tmp->b_this_page;
2490 /* The buffer can be either on the regular
2491 * queues or on the free list..
2493 if (p->b_dev != B_FREE) {
2494 remove_inode_queue(p);
2495 __remove_from_queues(p);
2496 } else
2497 __remove_from_free_list(p, index);
2498 __put_unused_buffer_head(p);
2499 } while (tmp != bh);
2500 spin_unlock(&unused_list_lock);
2502 /* Wake up anyone waiting for buffer heads */
2503 wake_up(&buffer_wait);
2505 /* And free the page */
2506 page->buffers = NULL;
2507 page_cache_release(page);
2508 spin_unlock(&free_list[index].lock);
2509 write_unlock(&hash_table_lock);
2510 spin_unlock(&lru_list_lock);
2511 return 1;
2513 busy_buffer_page:
2514 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2515 spin_unlock(&free_list[index].lock);
2516 write_unlock(&hash_table_lock);
2517 spin_unlock(&lru_list_lock);
2518 if (wait) {
2519 sync_page_buffers(bh, wait);
2520 /* We waited synchronously, so we can free the buffers. */
2521 if (wait > 1 && !loop) {
2522 loop = 1;
2523 goto cleaned_buffers_try_again;
2526 return 0;
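/*
 * Editor's illustrative sketch (not part of the original source): the VM
 * is the usual caller of try_to_free_buffers(); a page-reclaim path would
 * try it roughly like this on a locked page, passing wait > 0 when it is
 * allowed to block on buffer writeback.  This is a simplified, hypothetical
 * caller, not the actual page_launder()/shrink_cache() code.
 */
#if 0
static int example_release_page(struct page *page, int can_block)
{
	int freed = 0;

	if (page->buffers)
		freed = try_to_free_buffers(page, can_block ? 2 : 0);
	UnlockPage(page);
	return freed;
}
#endif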
2529 /* ================== Debugging =================== */
2531 void show_buffers(void)
2533 #ifdef CONFIG_SMP
2534 struct buffer_head * bh;
2535 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2536 int protected = 0;
2537 int nlist;
2538 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2539 #endif
2541 printk("Buffer memory: %6dkB\n",
2542 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2544 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2545 if (!spin_trylock(&lru_list_lock))
2546 return;
2547 for(nlist = 0; nlist < NR_LIST; nlist++) {
2548 found = locked = dirty = used = lastused = protected = 0;
2549 bh = lru_list[nlist];
2550 if(!bh) continue;
2552 do {
2553 found++;
2554 if (buffer_locked(bh))
2555 locked++;
2556 if (buffer_protected(bh))
2557 protected++;
2558 if (buffer_dirty(bh))
2559 dirty++;
2560 if (atomic_read(&bh->b_count))
2561 used++, lastused = found;
2562 bh = bh->b_next_free;
2563 } while (bh != lru_list[nlist]);
2565 int tmp = nr_buffers_type[nlist];
2566 if (found != tmp)
2567 printk("%9s: BUG -> found %d, reported %d\n",
2568 buf_types[nlist], found, tmp);
2570 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2571 "%d locked, %d protected, %d dirty\n",
2572 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2573 used, lastused, locked, protected, dirty);
2575 spin_unlock(&lru_list_lock);
2576 #endif
2579 /* ===================== Init ======================= */
2582 * allocate the hash table and init the free list
2583 * Use gfp() for the hash table to decrease TLB misses, use
2584 * SLAB cache for buffer heads.
2586 void __init buffer_init(unsigned long mempages)
2588 int order, i;
2589 unsigned int nr_hash;
2591 /* The buffer cache hash table is less important these days,
2592 * trim it a bit.
2594 mempages >>= 14;
2596 mempages *= sizeof(struct buffer_head *);
2598 for (order = 0; (1 << order) < mempages; order++)
2601 /* try to allocate something until we get it or we're asking
2602 for something that is really too small */
2604 do {
2605 unsigned long tmp;
2607 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2608 bh_hash_mask = (nr_hash - 1);
2610 tmp = nr_hash;
2611 bh_hash_shift = 0;
2612 while((tmp >>= 1UL) != 0UL)
2613 bh_hash_shift++;
2615 hash_table = (struct buffer_head **)
2616 __get_free_pages(GFP_ATOMIC, order);
2617 } while (hash_table == NULL && --order > 0);
2618 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2619 nr_hash, order, (PAGE_SIZE << order));
2621 if (!hash_table)
2622 panic("Failed to allocate buffer hash table\n");
2624 /* Setup hash chains. */
2625 for(i = 0; i < nr_hash; i++)
2626 hash_table[i] = NULL;
2628 /* Setup free lists. */
2629 for(i = 0; i < NR_SIZES; i++) {
2630 free_list[i].list = NULL;
2631 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2634 /* Setup lru lists. */
2635 for(i = 0; i < NR_LIST; i++)
2636 lru_list[i] = NULL;
2641 /* ====================== bdflush support =================== */
2643 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2644 * response to dirty buffers. Once this process is activated, we write back
2645 * a limited number of buffers to the disks and then go back to sleep again.
2647 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2648 struct task_struct *bdflush_tsk = 0;
2650 void wakeup_bdflush(int block)
2652 DECLARE_WAITQUEUE(wait, current);
2654 if (current == bdflush_tsk)
2655 return;
2657 if (!block) {
2658 wake_up_process(bdflush_tsk);
2659 return;
2662 /* bdflush can wake us up before we have a chance to
2663 go to sleep, so we must be smart in handling
2664 this wakeup event from bdflush to avoid deadlocking on SMP
2665 (we are not holding any lock anymore in these two paths). */
2666 __set_current_state(TASK_UNINTERRUPTIBLE);
2667 add_wait_queue(&bdflush_done, &wait);
2669 wake_up_process(bdflush_tsk);
2670 schedule();
2672 remove_wait_queue(&bdflush_done, &wait);
2673 __set_current_state(TASK_RUNNING);
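/*
 * Editor's illustrative sketch (not part of the original source): how a
 * hypothetical caller that just dirtied a lot of buffers might poke bdflush.
 * Passing 0 only wakes the daemon; passing 1 also blocks until bdflush has
 * signalled bdflush_done, which is what memory-pressure paths want.
 */
#if 0
static void example_after_dirtying(int memory_pressure)
{
	wakeup_bdflush(memory_pressure ? 1 : 0);
}
#endif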
2676 /* This is the _only_ function that deals with flushing async writes
2677 to disk.
2678 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2679 as all dirty buffers live _only_ in the DIRTY lru list.
2680 As we never browse the LOCKED and CLEAN lru lists they are in fact
2681 completely useless. */
2682 static int flush_dirty_buffers(int check_flushtime)
2684 struct buffer_head * bh, *next;
2685 int flushed = 0, i;
2687 restart:
2688 spin_lock(&lru_list_lock);
2689 bh = lru_list[BUF_DIRTY];
2690 if (!bh)
2691 goto out_unlock;
2692 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2693 next = bh->b_next_free;
2695 if (!buffer_dirty(bh)) {
2696 __refile_buffer(bh);
2697 continue;
2699 if (buffer_locked(bh))
2700 continue;
2702 if (check_flushtime) {
2703 /* The dirty lru list is chronologically ordered so
2704 if the current bh is not yet timed out,
2705 then all the following bhs
2706 will be too young as well. */
2707 if (time_before(jiffies, bh->b_flushtime))
2708 goto out_unlock;
2709 } else {
2710 if (++flushed > bdf_prm.b_un.ndirty)
2711 goto out_unlock;
2714 /* OK, now we are committed to write it out. */
2715 atomic_inc(&bh->b_count);
2716 spin_unlock(&lru_list_lock);
2717 ll_rw_block(WRITE, 1, &bh);
2718 atomic_dec(&bh->b_count);
2720 if (current->need_resched)
2721 schedule();
2722 goto restart;
2724 out_unlock:
2725 spin_unlock(&lru_list_lock);
2727 return flushed;
2731 * Here we attempt to write back old buffers. We also try to flush inodes
2732 * and supers, since this function is essentially "update", and
2733 * otherwise there would be no way of ensuring that these quantities ever
2734 * get written back. Ideally, we would have a timestamp on the inodes
2735 * and superblocks so that we could write back only the old ones.
2738 static int sync_old_buffers(void)
2740 lock_kernel();
2741 sync_supers(0);
2742 sync_inodes(0);
2743 unlock_kernel();
2745 flush_dirty_buffers(1);
2746 /* must really sync all the active I/O request to disk here */
2747 run_task_queue(&tq_disk);
2748 return 0;
2751 int block_sync_page(struct page *page)
2753 run_task_queue(&tq_disk);
2754 return 0;
2757 /* This is the interface to bdflush. As we get more sophisticated, we can
2758 * pass tuning parameters to this "process", to adjust how it behaves.
2759 * We would want to verify each parameter, however, to make sure that it
2760 * is reasonable. */
2762 asmlinkage long sys_bdflush(int func, long data)
2764 if (!capable(CAP_SYS_ADMIN))
2765 return -EPERM;
2767 if (func == 1) {
2768 /* do_exit() directly and let kupdate do its work alone. */
2769 do_exit(0);
2770 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2771 a syscall that doesn't care about the current mm context. */
2772 int error;
2773 struct mm_struct *user_mm;
2776 * bdflush will spend all of its time in kernel-space,
2777 * without touching user-space, so we can switch it into
2778 * 'lazy TLB mode' to reduce the cost of context-switches
2779 * to and from bdflush.
2781 user_mm = start_lazy_tlb();
2782 error = sync_old_buffers();
2783 end_lazy_tlb(user_mm);
2784 return error;
2785 #endif
2788 /* Basically, for func >= 2: an even func reads param (func-2)/2, an odd func writes it */
2789 if (func >= 2) {
2790 int i = (func-2) >> 1;
2791 if (i >= 0 && i < N_PARAM) {
2792 if ((func & 1) == 0)
2793 return put_user(bdf_prm.data[i], (int*)data);
2795 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2796 bdf_prm.data[i] = data;
2797 return 0;
2800 return -EINVAL;
2803 /* Func 0 used to launch the actual bdflush and then never
2804 * return (unless explicitly killed). We return zero here to
2805 * remain semi-compatible with present update(8) programs.
2807 return 0;
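/*
 * Editor's illustrative user-space sketch (not part of the original source):
 * tuning bdflush through the func >= 2 interface described above.  An even
 * func reads parameter (func-2)/2, an odd func writes it; parameter 0 is
 * ndirty.  The raw syscall is used here rather than any libc wrapper.
 */
#if 0
#include <unistd.h>
#include <sys/syscall.h>

int example_tune_bdflush(void)
{
	int ndirty;

	if (syscall(SYS_bdflush, 2, (long) &ndirty) < 0)	/* read param 0 */
		return -1;
	return syscall(SYS_bdflush, 3, 500L);			/* set param 0 to 500 */
}
#endif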
2811 * This is the actual bdflush daemon itself. It used to be started from
2812 * the syscall above, but now we launch it ourselves internally with
2813 * kernel_thread(...) directly after the first thread in init/main.c
2815 int bdflush(void *sem)
2817 struct task_struct *tsk = current;
2818 int flushed;
2820 * We have a bare-bones task_struct, and really should fill
2821 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2822 * display semi-sane things. Not really crucial though...
2825 tsk->session = 1;
2826 tsk->pgrp = 1;
2827 strcpy(tsk->comm, "bdflush");
2828 bdflush_tsk = tsk;
2830 /* avoid getting signals */
2831 spin_lock_irq(&tsk->sigmask_lock);
2832 flush_signals(tsk);
2833 sigfillset(&tsk->blocked);
2834 recalc_sigpending(tsk);
2835 spin_unlock_irq(&tsk->sigmask_lock);
2837 up((struct semaphore *)sem);
2839 for (;;) {
2840 CHECK_EMERGENCY_SYNC
2842 flushed = flush_dirty_buffers(0);
2843 if (free_shortage())
2844 flushed += page_launder(GFP_BUFFER, 0);
2846 /* If wakeup_bdflush wakes us up
2847 after our bdflush_done wakeup, then
2848 we must make sure not to sleep
2849 in schedule_timeout, otherwise
2850 wakeup_bdflush may wait for a
2851 bdflush_done wakeup that would never arrive
2852 (as we would be sleeping) and so it would
2853 deadlock on SMP. */
2854 __set_current_state(TASK_INTERRUPTIBLE);
2855 wake_up_all(&bdflush_done);
2857 * If there are still a lot of dirty buffers around,
2858 * skip the sleep and flush some more. Otherwise, we
2859 * go to sleep waiting for a wakeup.
2861 if (!flushed || balance_dirty_state(NODEV) < 0) {
2862 run_task_queue(&tq_disk);
2863 schedule();
2865 /* Remember to mark ourselves as running, otherwise
2866 the next schedule() will block. */
2867 __set_current_state(TASK_RUNNING);
2872 * This is the kernel update daemon. It used to live in userspace
2873 * but since it needs to run safely we want it to be unkillable by mistake.
2874 * You don't need to change your userspace configuration since
2875 * the userspace `update` will do_exit(0) at the first sys_bdflush().
2877 int kupdate(void *sem)
2879 struct task_struct * tsk = current;
2880 int interval;
2882 tsk->session = 1;
2883 tsk->pgrp = 1;
2884 strcpy(tsk->comm, "kupdate");
2886 /* SIGSTOP and SIGCONT will stop and wake up kupdate */
2887 spin_lock_irq(&tsk->sigmask_lock);
2888 sigfillset(&tsk->blocked);
2889 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2890 recalc_sigpending(tsk);
2891 spin_unlock_irq(&tsk->sigmask_lock);
2893 up((struct semaphore *)sem);
2895 for (;;) {
2896 /* update interval */
2897 interval = bdf_prm.b_un.interval;
2898 if (interval) {
2899 tsk->state = TASK_INTERRUPTIBLE;
2900 schedule_timeout(interval);
2901 } else {
2902 stop_kupdate:
2903 tsk->state = TASK_STOPPED;
2904 schedule(); /* wait for SIGCONT */
2906 /* check for sigstop */
2907 if (signal_pending(tsk)) {
2908 int stopped = 0;
2909 spin_lock_irq(&tsk->sigmask_lock);
2910 if (sigismember(&tsk->pending.signal, SIGSTOP)) {
2911 sigdelset(&tsk->pending.signal, SIGSTOP);
2912 stopped = 1;
2914 recalc_sigpending(tsk);
2915 spin_unlock_irq(&tsk->sigmask_lock);
2916 if (stopped)
2917 goto stop_kupdate;
2919 #ifdef DEBUG
2920 printk("kupdate() activated...\n");
2921 #endif
2922 sync_old_buffers();
2926 static int __init bdflush_init(void)
2928 DECLARE_MUTEX_LOCKED(sem);
2929 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2930 down(&sem);
2931 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
2932 down(&sem);
2933 return 0;
2936 module_init(bdflush_init)