Import 2.3.12pre9
[davej-history.git] / fs / buffer.c
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 #include <linux/sched.h>
30 #include <linux/fs.h>
31 #include <linux/malloc.h>
32 #include <linux/locks.h>
33 #include <linux/errno.h>
34 #include <linux/swap.h>
35 #include <linux/swapctl.h>
36 #include <linux/smp_lock.h>
37 #include <linux/vmalloc.h>
38 #include <linux/blkdev.h>
39 #include <linux/sysrq.h>
40 #include <linux/file.h>
41 #include <linux/init.h>
42 #include <linux/quotaops.h>
43 #include <linux/iobuf.h>
45 #include <asm/uaccess.h>
46 #include <asm/io.h>
47 #include <asm/bitops.h>
48 #include <asm/mmu_context.h>
50 #define NR_SIZES 7
51 static char buffersize_index[65] =
52 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
53 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
54 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
55 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
56 6};
58 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
59 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
60 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
61 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
62 number of unused buffer heads */
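As an aside, here is a minimal standalone sketch (plain userspace C, not part of the kernel file) of what buffersize_index[] and BUFSIZE_INDEX() above compute: the table is a precomputed log2(size) - 9 for the power-of-two block sizes 512..32768, so every valid size maps to a small index 0..6.

#include <assert.h>
#include <stdio.h>

static const signed char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};
#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])

int main(void)
{
        int size;

        /* Every power-of-two block size from 512 to 32k maps to 0..6. */
        for (size = 512; size <= 32768; size <<= 1)
                printf("size %5d -> index %d\n", size, BUFSIZE_INDEX(size));
        assert(BUFSIZE_INDEX(1024) == 1);       /* 1024>>9 == 2, table[2] == 1 */
        return 0;
}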
64 /* Anti-deadlock ordering:
65 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
69 * Hash table gook..
71 static unsigned int bh_hash_mask = 0;
72 static unsigned int bh_hash_shift = 0;
73 static struct buffer_head **hash_table;
74 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
76 static struct buffer_head *lru_list[NR_LIST];
77 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
78 static int nr_buffers_type[NR_LIST] = {0,};
80 static struct buffer_head * unused_list = NULL;
81 static int nr_unused_buffer_heads = 0;
82 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
83 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
85 struct bh_free_head {
86 struct buffer_head *list;
87 spinlock_t lock;
89 static struct bh_free_head free_list[NR_SIZES];
91 static kmem_cache_t *bh_cachep;
93 static int grow_buffers(int size);
95 /* This is used by some architectures to estimate available memory. */
96 atomic_t buffermem = ATOMIC_INIT(0);
98 /* Here is the parameter block for the bdflush process. If you add or
99 * remove any of the parameters, make sure to update kernel/sysctl.c.
102 #define N_PARAM 9
104 /* The dummy values in this structure are left in there for compatibility
105 * with old programs that play with the /proc entries.
107 union bdflush_param {
108 struct {
109 int nfract; /* Percentage of buffer cache dirty to
110 activate bdflush */
111 int ndirty; /* Maximum number of dirty blocks to write out per
112 wake-cycle */
113 int nrefill; /* Number of clean buffers to try to obtain
114 each time we call refill */
115 int nref_dirt; /* Dirty buffer threshold for activating bdflush
116 when trying to refill buffers. */
117 int dummy1; /* unused */
118 int age_buffer; /* Time for normal buffer to age before we flush it */
119 int age_super; /* Time for superblock to age before we flush it */
120 int dummy2; /* unused */
121 int dummy3; /* unused */
122 } b_un;
123 unsigned int data[N_PARAM];
124 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
126 /* These are the min and max parameter values that we will allow to be assigned */
127 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
128 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
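A hedged, standalone sketch (userspace C; the parameter defaults are copied from bdf_prm above, while HZ and the helper name are assumptions for the example) of how a write to one of these tunables is validated against bdflush_min[]/bdflush_max[], mirroring the range check sys_bdflush() performs further down.

#include <stdio.h>

#define N_PARAM 9
#define HZ 100                          /* assumed tick rate for the example */

static unsigned int bdf_data[N_PARAM] = { 40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2 };
static const int bdflush_min[N_PARAM] = {   0,    10,     5,    25,    0,    1*HZ,    1*HZ,    1, 1 };
static const int bdflush_max[N_PARAM] = { 100, 50000, 20000, 20000, 1000, 6000*HZ, 6000*HZ, 2047, 5 };

/* Returns 0 on success, -1 (-EINVAL in the kernel) on a bad index or value. */
static int set_bdflush_param(int i, long value)
{
        if (i < 0 || i >= N_PARAM)
                return -1;
        if (value < bdflush_min[i] || value > bdflush_max[i])
                return -1;
        bdf_data[i] = value;
        return 0;
}

int main(void)
{
        printf("ndirty=500:   %d\n", set_bdflush_param(1, 500));    /* accepted */
        printf("ndirty=90000: %d\n", set_bdflush_param(1, 90000));  /* rejected */
        printf("ndirty is now %u\n", bdf_data[1]);
        return 0;
}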
130 void wakeup_bdflush(int);
133 * Rewrote the wait-routines to use the "new" wait-queue functionality,
134 * and getting rid of the cli-sti pairs. The wait-queue routines still
135 * need cli-sti, but now it's just a couple of 386 instructions or so.
137 * Note that the real wait_on_buffer() is an inline function that checks
138 * if 'b_wait' is set before calling this, so that the queues aren't set
139 * up unnecessarily.
141 void __wait_on_buffer(struct buffer_head * bh)
143 struct task_struct *tsk = current;
144 DECLARE_WAITQUEUE(wait, tsk);
146 atomic_inc(&bh->b_count);
147 add_wait_queue(&bh->b_wait, &wait);
148 repeat:
149 tsk->state = TASK_UNINTERRUPTIBLE;
150 run_task_queue(&tq_disk);
151 if (buffer_locked(bh)) {
152 schedule();
153 goto repeat;
155 tsk->state = TASK_RUNNING;
156 remove_wait_queue(&bh->b_wait, &wait);
157 atomic_dec(&bh->b_count);
160 /* Call sync_buffers with wait!=0 to ensure that the call does not
161 * return until all buffer writes have completed. Sync() may return
162 * before the writes have finished; fsync() may not.
165 /* Godamity-damn. Some buffers (bitmaps for filesystems)
166 * spontaneously dirty themselves without ever brelse being called.
167 * We will ultimately want to put these in a separate list, but for
168 * now we search all of the lists for dirty buffers.
170 static int sync_buffers(kdev_t dev, int wait)
172 int i, retry, pass = 0, err = 0;
173 struct buffer_head * bh, *next;
175 /* One pass for no-wait, three for wait:
176 * 0) write out all dirty, unlocked buffers;
177 * 1) write out all dirty buffers, waiting if locked;
178 * 2) wait for completion by waiting for all buffers to unlock.
180 do {
181 retry = 0;
183 /* We search all lists as a failsafe mechanism, not because we expect
184 * there to be dirty buffers on any of the other lists.
186 repeat:
187 spin_lock(&lru_list_lock);
188 bh = lru_list[BUF_DIRTY];
189 if (!bh)
190 goto repeat2;
192 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
193 next = bh->b_next_free;
195 if (!lru_list[BUF_DIRTY])
196 break;
197 if (dev && bh->b_dev != dev)
198 continue;
199 if (buffer_locked(bh)) {
200 /* Buffer is locked; skip it unless wait is
201 * requested AND pass > 0.
203 if (!wait || !pass) {
204 retry = 1;
205 continue;
207 atomic_inc(&bh->b_count);
208 spin_unlock(&lru_list_lock);
209 wait_on_buffer (bh);
210 atomic_dec(&bh->b_count);
211 goto repeat;
214 /* If an unlocked buffer is not uptodate, there has
215 * been an IO error. Skip it.
217 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
218 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
219 err = -EIO;
220 continue;
223 /* Don't write clean buffers. Don't write ANY buffers
224 * on the third pass.
226 if (!buffer_dirty(bh) || pass >= 2)
227 continue;
229 atomic_inc(&bh->b_count);
230 bh->b_flushtime = 0;
231 spin_unlock(&lru_list_lock);
232 ll_rw_block(WRITE, 1, &bh);
233 atomic_dec(&bh->b_count);
234 retry = 1;
235 goto repeat;
238 repeat2:
239 bh = lru_list[BUF_LOCKED];
240 if (!bh) {
241 spin_unlock(&lru_list_lock);
242 break;
244 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
245 next = bh->b_next_free;
247 if (!lru_list[BUF_LOCKED])
248 break;
249 if (dev && bh->b_dev != dev)
250 continue;
251 if (buffer_locked(bh)) {
252 /* Buffer is locked; skip it unless wait is
253 * requested AND pass > 0.
255 if (!wait || !pass) {
256 retry = 1;
257 continue;
259 atomic_inc(&bh->b_count);
260 spin_unlock(&lru_list_lock);
261 wait_on_buffer (bh);
262 spin_lock(&lru_list_lock);
263 atomic_dec(&bh->b_count);
264 goto repeat2;
267 spin_unlock(&lru_list_lock);
269 /* If we are waiting for the sync to succeed, and if any dirty
270 * blocks were written, then repeat; on the second pass, only
271 * wait for buffers being written (do not pass to write any
272 * more buffers on the second pass).
274 } while (wait && retry && ++pass<=2);
275 return err;
278 void sync_dev(kdev_t dev)
280 sync_buffers(dev, 0);
281 sync_supers(dev);
282 sync_inodes(dev);
283 sync_buffers(dev, 0);
284 DQUOT_SYNC(dev);
286 * FIXME(eric) we need to sync the physical devices here.
287 * This is because some (scsi) controllers have huge amounts of
288 * cache onboard (hundreds of Mb), and we need to instruct
289 * them to commit all of the dirty memory to disk, and we should
290 * not return until this has happened.
292 * This would need to get implemented by going through the assorted
293 * layers so that each block major number can be synced, and this
294 * would call down into the upper and mid-layer scsi.
298 int fsync_dev(kdev_t dev)
300 sync_buffers(dev, 0);
302 lock_kernel();
303 sync_supers(dev);
304 sync_inodes(dev);
305 DQUOT_SYNC(dev);
306 unlock_kernel();
308 return sync_buffers(dev, 1);
311 asmlinkage int sys_sync(void)
313 fsync_dev(0);
314 return 0;
318 * filp may be NULL if called via the msync of a vma.
321 int file_fsync(struct file *filp, struct dentry *dentry)
323 struct inode * inode = dentry->d_inode;
324 struct super_block * sb;
325 kdev_t dev;
327 /* sync the inode to buffers */
328 write_inode_now(inode);
330 /* sync the superblock to buffers */
331 sb = inode->i_sb;
332 wait_on_super(sb);
333 if (sb->s_op && sb->s_op->write_super)
334 sb->s_op->write_super(sb);
336 /* .. finally sync the buffers to disk */
337 dev = inode->i_dev;
338 return sync_buffers(dev, 1);
341 asmlinkage int sys_fsync(unsigned int fd)
343 struct file * file;
344 struct dentry * dentry;
345 struct inode * inode;
346 int err;
348 lock_kernel();
349 err = -EBADF;
350 file = fget(fd);
351 if (!file)
352 goto out;
354 dentry = file->f_dentry;
355 if (!dentry)
356 goto out_putf;
358 inode = dentry->d_inode;
359 if (!inode)
360 goto out_putf;
362 err = -EINVAL;
363 if (!file->f_op || !file->f_op->fsync)
364 goto out_putf;
366 /* We need to protect against concurrent writers.. */
367 down(&inode->i_sem);
368 err = file->f_op->fsync(file, dentry);
369 up(&inode->i_sem);
371 out_putf:
372 fput(file);
373 out:
374 unlock_kernel();
375 return err;
378 asmlinkage int sys_fdatasync(unsigned int fd)
380 struct file * file;
381 struct dentry * dentry;
382 struct inode * inode;
383 int err;
385 lock_kernel();
386 err = -EBADF;
387 file = fget(fd);
388 if (!file)
389 goto out;
391 dentry = file->f_dentry;
392 if (!dentry)
393 goto out_putf;
395 inode = dentry->d_inode;
396 if (!inode)
397 goto out_putf;
399 err = -EINVAL;
400 if (!file->f_op || !file->f_op->fsync)
401 goto out_putf;
403 /* this needs further work, at the moment it is identical to fsync() */
404 down(&inode->i_sem);
405 err = file->f_op->fsync(file, dentry);
406 up(&inode->i_sem);
408 out_putf:
409 fput(file);
410 out:
411 unlock_kernel();
412 return err;
415 void invalidate_buffers(kdev_t dev)
417 int nlist;
419 spin_lock(&lru_list_lock);
420 for(nlist = 0; nlist < NR_LIST; nlist++) {
421 struct buffer_head * bh;
422 int i;
423 retry:
424 bh = lru_list[nlist];
425 if (!bh)
426 continue;
427 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
428 if (bh->b_dev != dev)
429 continue;
430 if (buffer_locked(bh)) {
431 atomic_inc(&bh->b_count);
432 spin_unlock(&lru_list_lock);
433 wait_on_buffer(bh);
434 spin_lock(&lru_list_lock);
435 atomic_dec(&bh->b_count);
436 goto retry;
438 if (atomic_read(&bh->b_count))
439 continue;
440 bh->b_flushtime = 0;
441 clear_bit(BH_Protected, &bh->b_state);
442 clear_bit(BH_Uptodate, &bh->b_state);
443 clear_bit(BH_Dirty, &bh->b_state);
444 clear_bit(BH_Req, &bh->b_state);
447 spin_unlock(&lru_list_lock);
450 /* After several hours of tedious analysis, the following hash
451 * function won. Do not mess with it... -DaveM
453 #define _hashfn(dev,block) \
454 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
455 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
456 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
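To make the hash macro above easier to read, here is a hedged userspace transcription of _hashfn()/hash() (the 4096-bucket table size is an assumption for the example; the real size is chosen in buffer_init()). The point is that dev and block are folded together with a few shifts and XORs, and the final AND with bh_hash_mask keeps the index inside the power-of-two table.

#include <stdio.h>

static unsigned int bh_hash_shift = 12;                 /* assumed: 4096 buckets */
static unsigned int bh_hash_mask  = (1u << 12) - 1;

static unsigned int bh_hash(unsigned int dev, unsigned int block)
{
        unsigned int h = ((dev << (bh_hash_shift - 6)) ^ (dev << (bh_hash_shift - 9))) ^
                         ((block << (bh_hash_shift - 6)) ^ (block >> 13) ^
                          (block << (bh_hash_shift - 12)));
        return h & bh_hash_mask;
}

int main(void)
{
        unsigned int block;

        /* Neighbouring blocks of the same device land in different buckets. */
        for (block = 0; block < 4; block++)
                printf("dev 0x0801, block %u -> bucket %u\n",
                       block, bh_hash(0x0801, block));
        return 0;
}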
458 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
460 if ((bh->b_next = *head) != NULL)
461 bh->b_next->b_pprev = &bh->b_next;
462 *head = bh;
463 bh->b_pprev = head;
466 static __inline__ void __hash_unlink(struct buffer_head *bh)
468 if (bh->b_next)
469 bh->b_next->b_pprev = bh->b_pprev;
470 *(bh->b_pprev) = bh->b_next;
471 bh->b_pprev = NULL;
474 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
476 struct buffer_head **bhp = &lru_list[blist];
478 if(!*bhp) {
479 *bhp = bh;
480 bh->b_prev_free = bh;
482 bh->b_next_free = *bhp;
483 bh->b_prev_free = (*bhp)->b_prev_free;
484 (*bhp)->b_prev_free->b_next_free = bh;
485 (*bhp)->b_prev_free = bh;
486 nr_buffers_type[blist]++;
489 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
491 if (bh->b_prev_free || bh->b_next_free) {
492 bh->b_prev_free->b_next_free = bh->b_next_free;
493 bh->b_next_free->b_prev_free = bh->b_prev_free;
494 if (lru_list[blist] == bh)
495 lru_list[blist] = bh->b_next_free;
496 if (lru_list[blist] == bh)
497 lru_list[blist] = NULL;
498 bh->b_next_free = bh->b_prev_free = NULL;
499 nr_buffers_type[blist]--;
503 static void __remove_from_free_list(struct buffer_head * bh, int index)
505 if(bh->b_next_free == bh)
506 free_list[index].list = NULL;
507 else {
508 bh->b_prev_free->b_next_free = bh->b_next_free;
509 bh->b_next_free->b_prev_free = bh->b_prev_free;
510 if (free_list[index].list == bh)
511 free_list[index].list = bh->b_next_free;
513 bh->b_next_free = bh->b_prev_free = NULL;
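The free and LRU lists manipulated above are circular doubly-linked lists threaded through b_next_free/b_prev_free, with the head pointer naming an arbitrary element. A minimal standalone sketch of the same pointer discipline (simplified types, userspace, not kernel code):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct node {
        int blocknr;
        struct node *next_free, *prev_free;
};

static void list_insert(struct node **head, struct node *n)
{
        if (!*head) {
                *head = n;
                n->prev_free = n;
                n->next_free = n;
                return;
        }
        n->next_free = *head;
        n->prev_free = (*head)->prev_free;
        (*head)->prev_free->next_free = n;
        (*head)->prev_free = n;
}

static void list_remove(struct node **head, struct node *n)
{
        n->prev_free->next_free = n->next_free;
        n->next_free->prev_free = n->prev_free;
        if (*head == n)
                *head = n->next_free;
        if (*head == n)                 /* n was the only element */
                *head = NULL;
        n->next_free = n->prev_free = NULL;
}

int main(void)
{
        struct node a = { 1 }, b = { 2 }, *head = NULL;

        list_insert(&head, &a);
        list_insert(&head, &b);
        assert(head == &a && a.next_free == &b && b.next_free == &a);
        list_remove(&head, &a);
        assert(head == &b && b.next_free == &b);
        list_remove(&head, &b);
        assert(head == NULL);
        printf("circular list ok\n");
        return 0;
}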
516 /* The following two functions must operate atomically
517 * because they control the visibility of a buffer head
518 * to the rest of the kernel.
520 static __inline__ void __remove_from_queues(struct buffer_head *bh)
522 write_lock(&hash_table_lock);
523 if (bh->b_pprev)
524 __hash_unlink(bh);
525 __remove_from_lru_list(bh, bh->b_list);
526 write_unlock(&hash_table_lock);
529 static void insert_into_queues(struct buffer_head *bh)
531 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
533 spin_lock(&lru_list_lock);
534 write_lock(&hash_table_lock);
535 __hash_link(bh, head);
536 __insert_into_lru_list(bh, bh->b_list);
537 write_unlock(&hash_table_lock);
538 spin_unlock(&lru_list_lock);
541 /* This function must only run if there are no other
542 * references _anywhere_ to this buffer head.
544 static void put_last_free(struct buffer_head * bh)
546 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
547 struct buffer_head **bhp = &head->list;
549 spin_lock(&head->lock);
550 bh->b_dev = B_FREE;
551 if(!*bhp) {
552 *bhp = bh;
553 bh->b_prev_free = bh;
555 bh->b_next_free = *bhp;
556 bh->b_prev_free = (*bhp)->b_prev_free;
557 (*bhp)->b_prev_free->b_next_free = bh;
558 (*bhp)->b_prev_free = bh;
559 spin_unlock(&head->lock);
563 * Why like this, I hear you say... The reason is race-conditions.
564 * As we don't lock buffers (unless we are reading them, that is),
565 * something might happen to it while we sleep (ie a read-error
566 * will force it bad). This shouldn't really happen currently, but
567 * the code is ready.
569 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
571 struct buffer_head **head = &hash(dev, block);
572 struct buffer_head *bh;
574 read_lock(&hash_table_lock);
575 for(bh = *head; bh; bh = bh->b_next)
576 if (bh->b_blocknr == block &&
577 bh->b_size == size &&
578 bh->b_dev == dev)
579 break;
580 if (bh)
581 atomic_inc(&bh->b_count);
582 read_unlock(&hash_table_lock);
584 return bh;
587 unsigned int get_hardblocksize(kdev_t dev)
590 * Get the hard sector size for the given device. If we don't know
591 * what it is, return 0.
593 if (hardsect_size[MAJOR(dev)] != NULL) {
594 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
595 if (blksize != 0)
596 return blksize;
600 * We don't know what the hardware sector size for this device is.
601 * Return 0 indicating that we don't know.
603 return 0;
606 void set_blocksize(kdev_t dev, int size)
608 extern int *blksize_size[];
609 int i, nlist;
610 struct buffer_head * bh, *bhnext;
612 if (!blksize_size[MAJOR(dev)])
613 return;
615 /* Size must be a power of two, and between 512 and PAGE_SIZE */
616 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
617 panic("Invalid blocksize passed to set_blocksize");
619 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
620 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
621 return;
623 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
624 return;
625 sync_buffers(dev, 2);
626 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
628 /* We need to be quite careful how we do this - we are moving entries
629 * around on the free list, and we can get in a loop if we are not careful.
631 for(nlist = 0; nlist < NR_LIST; nlist++) {
632 repeat:
633 spin_lock(&lru_list_lock);
634 bh = lru_list[nlist];
635 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
636 if(!bh)
637 break;
639 bhnext = bh->b_next_free;
640 if (bh->b_dev != dev)
641 continue;
642 if (bh->b_size == size)
643 continue;
644 if (buffer_locked(bh)) {
645 atomic_inc(&bh->b_count);
646 spin_unlock(&lru_list_lock);
647 wait_on_buffer(bh);
648 atomic_dec(&bh->b_count);
649 goto repeat;
651 if (bh->b_dev == dev && bh->b_size != size) {
652 clear_bit(BH_Dirty, &bh->b_state);
653 clear_bit(BH_Uptodate, &bh->b_state);
654 clear_bit(BH_Req, &bh->b_state);
655 bh->b_flushtime = 0;
657 if (atomic_read(&bh->b_count) == 0) {
658 __remove_from_queues(bh);
659 put_last_free(bh);
662 spin_unlock(&lru_list_lock);
667 * We used to try various strange things. Let's not.
669 static void refill_freelist(int size)
671 if (!grow_buffers(size)) {
672 wakeup_bdflush(1);
673 current->policy |= SCHED_YIELD;
674 schedule();
678 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
680 bh->b_list = BUF_CLEAN;
681 bh->b_flushtime = 0;
682 bh->b_end_io = handler;
683 bh->b_dev_id = dev_id;
686 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
688 mark_buffer_uptodate(bh, uptodate);
689 unlock_buffer(bh);
692 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
694 mark_buffer_uptodate(bh, uptodate);
695 unlock_buffer(bh);
696 BUG();
699 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
701 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
702 unsigned long flags;
703 struct buffer_head *tmp;
704 struct page *page;
705 int free;
707 mark_buffer_uptodate(bh, uptodate);
709 /* This is a temporary buffer used for page I/O. */
710 page = mem_map + MAP_NR(bh->b_data);
712 if (!uptodate)
713 SetPageError(page);
716 * Be _very_ careful from here on. Bad things can happen if
717 * two buffer heads end IO at almost the same time and both
718 * decide that the page is now completely done.
720 * Async buffer_heads are here only as labels for IO, and get
721 * thrown away once the IO for this page is complete. IO is
722 * deemed complete once all buffers have been visited
723 * (b_count==0) and are now unlocked. We must make sure that
724 * only the _last_ buffer that decrements its count is the one
725 * that frees the page..
727 spin_lock_irqsave(&page_uptodate_lock, flags);
728 unlock_buffer(bh);
729 atomic_dec(&bh->b_count);
730 tmp = bh->b_this_page;
731 while (tmp != bh) {
732 if (atomic_read(&tmp->b_count) &&
733 (tmp->b_end_io == end_buffer_io_async))
734 goto still_busy;
735 tmp = tmp->b_this_page;
738 /* OK, the async IO on this page is complete. */
739 spin_unlock_irqrestore(&page_uptodate_lock, flags);
742 * if none of the buffers had errors then we can set the
743 * page uptodate:
745 if (!PageError(page))
746 SetPageUptodate(page);
749 * Run the hooks that have to be done when a page I/O has completed.
751 * Note - we need to test the flags before we unlock the page, but
752 * we must not actually free the page until after the unlock!
754 if (test_and_clear_bit(PG_decr_after, &page->flags))
755 atomic_dec(&nr_async_pages);
757 if (test_and_clear_bit(PG_free_swap_after, &page->flags))
758 swap_free(page->offset);
760 free = test_and_clear_bit(PG_free_after, &page->flags);
762 if (page->owner != (void *)-1)
763 PAGE_BUG(page);
764 page->owner = current;
765 UnlockPage(page);
767 if (free)
768 __free_page(page);
770 return;
772 still_busy:
773 spin_unlock_irqrestore(&page_uptodate_lock, flags);
774 return;
779 * Ok, this is getblk, and it isn't very clear, again to hinder
780 * race-conditions. Most of the code is seldom used, (ie repeating),
781 * so it should be much more efficient than it looks.
783 * The algorithm is changed: hopefully better, and an elusive bug removed.
785 * 14.02.92: changed it to sync dirty buffers a bit: better performance
786 * when the filesystem starts to get full of dirty blocks (I hope).
788 struct buffer_head * getblk(kdev_t dev, int block, int size)
790 struct buffer_head * bh;
791 int isize;
793 repeat:
794 bh = get_hash_table(dev, block, size);
795 if (bh) {
796 if (!buffer_dirty(bh)) {
797 bh->b_flushtime = 0;
799 goto out;
802 isize = BUFSIZE_INDEX(size);
803 spin_lock(&free_list[isize].lock);
804 bh = free_list[isize].list;
805 if (bh) {
806 __remove_from_free_list(bh, isize);
807 atomic_set(&bh->b_count, 1);
809 spin_unlock(&free_list[isize].lock);
810 if (!bh)
811 goto refill;
813 /* OK, FINALLY we know that this buffer is the only one of its kind,
814 * we hold a reference (b_count>0), it is unlocked, and it is clean.
816 init_buffer(bh, end_buffer_io_sync, NULL);
817 bh->b_dev = dev;
818 bh->b_blocknr = block;
819 bh->b_state = 1 << BH_Mapped;
821 /* Insert the buffer into the regular lists */
822 insert_into_queues(bh);
823 goto out;
826 * If we block while refilling the free list, somebody may
827 * create the buffer first ... search the hashes again.
829 refill:
830 refill_freelist(size);
831 goto repeat;
832 out:
833 return bh;
837 * if a new dirty buffer is created we need to balance bdflush.
839 * in the future we might want to make bdflush aware of different
840 * pressures on different devices - thus the (currently unused)
841 * 'dev' parameter.
843 int too_many_dirty_buffers;
845 void balance_dirty(kdev_t dev)
847 int dirty = nr_buffers_type[BUF_DIRTY];
848 int ndirty = bdf_prm.b_un.ndirty;
850 if (dirty > ndirty) {
851 if (dirty > 2*ndirty) {
852 too_many_dirty_buffers = 1;
853 wakeup_bdflush(1);
854 return;
856 wakeup_bdflush(0);
858 too_many_dirty_buffers = 0;
859 return;
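A small sketch (userspace; the sample counts are illustrative) of the two thresholds balance_dirty() applies: above 2*ndirty it declares the "too many dirty buffers" panic mode and waits for bdflush; otherwise the panic flag is cleared, and if the count still exceeds ndirty, bdflush is nudged asynchronously.

#include <stdio.h>

/* Returns a description of what balance_dirty() would do for this count. */
static const char *dirty_policy(int dirty, int ndirty)
{
        if (dirty > 2 * ndirty)
                return "panic mode: wake bdflush and wait for it";
        if (dirty > ndirty)
                return "wake bdflush asynchronously, clear panic flag";
        return "clear panic flag, nothing to flush";
}

int main(void)
{
        int ndirty = 500;                       /* bdf_prm.b_un.ndirty default */
        int samples[] = { 100, 600, 1200 };
        int i;

        for (i = 0; i < 3; i++)
                printf("%4d dirty buffers: %s\n",
                       samples[i], dirty_policy(samples[i], ndirty));
        return 0;
}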
862 static inline void __mark_dirty(struct buffer_head *bh, int flag)
864 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
865 clear_bit(BH_New, &bh->b_state);
866 refile_buffer(bh);
869 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
871 __mark_dirty(bh, flag);
875 * A buffer may need to be moved from one buffer list to another
876 * (e.g. in case it is not shared any more). Handle this.
878 static __inline__ void __refile_buffer(struct buffer_head *bh)
880 int dispose = BUF_CLEAN;
881 if (buffer_locked(bh))
882 dispose = BUF_LOCKED;
883 if (buffer_dirty(bh))
884 dispose = BUF_DIRTY;
885 if (dispose != bh->b_list) {
886 __remove_from_lru_list(bh, bh->b_list);
887 bh->b_list = dispose;
888 __insert_into_lru_list(bh, dispose);
892 void refile_buffer(struct buffer_head *bh)
894 spin_lock(&lru_list_lock);
895 __refile_buffer(bh);
896 spin_unlock(&lru_list_lock);
900 * Release a buffer head
902 void __brelse(struct buffer_head * buf)
904 touch_buffer(buf);
906 if (atomic_read(&buf->b_count)) {
907 atomic_dec(&buf->b_count);
908 return;
910 printk("VFS: brelse: Trying to free free buffer\n");
914 * bforget() is like brelse(), except it puts the buffer on the
915 * free list if it can.. We can NOT free the buffer if:
916 * - there are other users of it
917 * - it is locked and thus can have active IO
919 void __bforget(struct buffer_head * buf)
921 spin_lock(&lru_list_lock);
922 write_lock(&hash_table_lock);
923 if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
924 touch_buffer(buf);
925 atomic_dec(&buf->b_count);
926 } else {
927 atomic_set(&buf->b_count, 0);
928 buf->b_state = 0;
929 if (buf->b_pprev)
930 __hash_unlink(buf);
931 __remove_from_lru_list(buf, buf->b_list);
932 put_last_free(buf);
934 write_unlock(&hash_table_lock);
935 spin_unlock(&lru_list_lock);
939 * bread() reads a specified block and returns the buffer that contains
940 * it. It returns NULL if the block was unreadable.
942 struct buffer_head * bread(kdev_t dev, int block, int size)
944 struct buffer_head * bh;
946 bh = getblk(dev, block, size);
947 if (buffer_uptodate(bh))
948 return bh;
949 ll_rw_block(READ, 1, &bh);
950 wait_on_buffer(bh);
951 if (buffer_uptodate(bh))
952 return bh;
953 brelse(bh);
954 return NULL;
958 * Ok, breada can be used as bread, but additionally marks other
959 * blocks for reading as well. End the argument list with a negative
960 * number.
963 #define NBUF 16
965 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
966 unsigned int pos, unsigned int filesize)
968 struct buffer_head * bhlist[NBUF];
969 unsigned int blocks;
970 struct buffer_head * bh;
971 int index;
972 int i, j;
974 if (pos >= filesize)
975 return NULL;
977 if (block < 0)
978 return NULL;
980 bh = getblk(dev, block, bufsize);
981 index = BUFSIZE_INDEX(bh->b_size);
983 if (buffer_uptodate(bh))
984 return(bh);
985 else ll_rw_block(READ, 1, &bh);
987 blocks = (filesize - pos) >> (9+index);
989 if (blocks < (read_ahead[MAJOR(dev)] >> index))
990 blocks = read_ahead[MAJOR(dev)] >> index;
991 if (blocks > NBUF)
992 blocks = NBUF;
994 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
996 bhlist[0] = bh;
997 j = 1;
998 for(i=1; i<blocks; i++) {
999 bh = getblk(dev,block+i,bufsize);
1000 if (buffer_uptodate(bh)) {
1001 brelse(bh);
1002 break;
1004 else bhlist[j++] = bh;
1007 /* Request the read for these buffers, and then release them. */
1008 if (j>1)
1009 ll_rw_block(READA, (j-1), bhlist+1);
1010 for(i=1; i<j; i++)
1011 brelse(bhlist[i]);
1013 /* Wait for this buffer, and then continue on. */
1014 bh = bhlist[0];
1015 wait_on_buffer(bh);
1016 if (buffer_uptodate(bh))
1017 return bh;
1018 brelse(bh);
1019 return NULL;
1023 * Note: the caller should wake up the buffer_wait list if needed.
1025 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1027 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1028 kmem_cache_free(bh_cachep, bh);
1029 } else {
1030 bh->b_blocknr = -1;
1031 init_waitqueue_head(&bh->b_wait);
1032 nr_unused_buffer_heads++;
1033 bh->b_next_free = unused_list;
1034 bh->b_this_page = NULL;
1035 unused_list = bh;
1039 static void put_unused_buffer_head(struct buffer_head *bh)
1041 spin_lock(&unused_list_lock);
1042 __put_unused_buffer_head(bh);
1043 spin_unlock(&unused_list_lock);
1047 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1048 * no-buffer-head deadlock. Return NULL on failure; waiting for
1049 * buffer heads is now handled in create_buffers().
1051 static struct buffer_head * get_unused_buffer_head(int async)
1053 struct buffer_head * bh;
1055 spin_lock(&unused_list_lock);
1056 if (nr_unused_buffer_heads > NR_RESERVED) {
1057 bh = unused_list;
1058 unused_list = bh->b_next_free;
1059 nr_unused_buffer_heads--;
1060 spin_unlock(&unused_list_lock);
1061 return bh;
1063 spin_unlock(&unused_list_lock);
1065 /* This is critical. We can't swap out pages to get
1066 * more buffer heads, because the swap-out may need
1067 * more buffer-heads itself. Thus SLAB_BUFFER.
1069 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1070 memset(bh, 0, sizeof(*bh));
1071 init_waitqueue_head(&bh->b_wait);
1072 return bh;
1076 * If we need an async buffer, use the reserved buffer heads.
1078 if (async) {
1079 spin_lock(&unused_list_lock);
1080 if (unused_list) {
1081 bh = unused_list;
1082 unused_list = bh->b_next_free;
1083 nr_unused_buffer_heads--;
1084 spin_unlock(&unused_list_lock);
1085 return bh;
1087 spin_unlock(&unused_list_lock);
1089 #if 0
1091 * (Pending further analysis ...)
1092 * Ordinary (non-async) requests can use a different memory priority
1093 * to free up pages. Any swapping thus generated will use async
1094 * buffer heads.
1096 if(!async &&
1097 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1098 memset(bh, 0, sizeof(*bh));
1099 init_waitqueue_head(&bh->b_wait);
1100 return bh;
1102 #endif
1104 return NULL;
1108 * Create the appropriate buffers when given a page for data area and
1109 * the size of each buffer.. Use the bh->b_this_page linked list to
1110 * follow the buffers created. Return NULL if unable to create more
1111 * buffers.
1112 * The async flag is used to differentiate async IO (paging, swapping)
1113 * from ordinary buffer allocations, and only async requests are allowed
1114 * to sleep waiting for buffer heads.
1116 static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async)
1118 DECLARE_WAITQUEUE(wait, current);
1119 struct buffer_head *bh, *head;
1120 long offset;
1122 try_again:
1123 head = NULL;
1124 offset = PAGE_SIZE;
1125 while ((offset -= size) >= 0) {
1126 bh = get_unused_buffer_head(async);
1127 if (!bh)
1128 goto no_grow;
1130 bh->b_dev = B_FREE; /* Flag as unused */
1131 bh->b_this_page = head;
1132 head = bh;
1134 bh->b_state = 0;
1135 bh->b_next_free = NULL;
1136 bh->b_pprev = NULL;
1137 atomic_set(&bh->b_count, 0);
1138 bh->b_size = size;
1140 bh->b_data = (char *) (page+offset);
1141 bh->b_list = BUF_CLEAN;
1142 bh->b_flushtime = 0;
1143 bh->b_end_io = end_buffer_io_bad;
1145 return head;
1147 * In case anything failed, we just free everything we got.
1149 no_grow:
1150 if (head) {
1151 do {
1152 bh = head;
1153 head = head->b_this_page;
1154 put_unused_buffer_head(bh);
1155 } while (head);
1157 /* Wake up any waiters ... */
1158 wake_up(&buffer_wait);
1162 * Return failure for non-async IO requests. Async IO requests
1163 * are not allowed to fail, so we have to wait until buffer heads
1164 * become available. But we don't want tasks sleeping with
1165 * partially complete buffers, so all were released above.
1167 if (!async)
1168 return NULL;
1170 /* We're _really_ low on memory. Now we just
1171 * wait for old buffer heads to become free due to
1172 * finishing IO. Since this is an async request and
1173 * the reserve list is empty, we're sure there are
1174 * async buffer heads in use.
1176 run_task_queue(&tq_disk);
1179 * Set our state for sleeping, then check again for buffer heads.
1180 * This ensures we won't miss a wake_up from an interrupt.
1182 add_wait_queue(&buffer_wait, &wait);
1183 current->state = TASK_UNINTERRUPTIBLE;
1184 if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
1185 current->policy |= SCHED_YIELD;
1186 schedule();
1188 remove_wait_queue(&buffer_wait, &wait);
1189 current->state = TASK_RUNNING;
1190 goto try_again;
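A standalone sketch (userspace C, simplified structure) of the page-carving loop in create_buffers() above: buffers are created from the end of the page backwards and chained through b_this_page; create_page_buffers() below then closes the chain into a ring with tail->b_this_page = head.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096L

struct buf {
        char *data;
        long size;
        struct buf *this_page;          /* stand-in for b_this_page */
};

static struct buf *carve_page(char *page, long size)
{
        struct buf *head = NULL, *bh;
        long offset = PAGE_SIZE;

        while ((offset -= size) >= 0) {
                bh = calloc(1, sizeof(*bh));
                if (!bh)
                        return NULL;    /* the real code unwinds what it built */
                bh->this_page = head;   /* chain onto the previous head */
                bh->data = page + offset;
                bh->size = size;
                head = bh;
        }
        return head;                    /* buffer covering offset 0 */
}

int main(void)
{
        char page[PAGE_SIZE];
        struct buf *bh = carve_page(page, 1024);

        while (bh) {
                struct buf *next = bh->this_page;
                printf("buffer at page offset %ld, size %ld\n",
                       (long)(bh->data - page), bh->size);
                free(bh);
                bh = next;
        }
        return 0;
}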
1193 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1195 struct buffer_head *head, *bh, *tail;
1196 int block;
1198 if (!PageLocked(page))
1199 BUG();
1200 if (page->owner != current)
1201 PAGE_BUG(page);
1203 * Allocate async buffer heads pointing to this page, just for I/O.
1204 * They show up in the buffer hash table and are registered in
1205 * page->buffers.
1207 head = create_buffers(page_address(page), size, 1);
1208 if (page->buffers)
1209 BUG();
1210 if (!head)
1211 BUG();
1212 tail = head;
1213 for (bh = head; bh; bh = bh->b_this_page) {
1214 block = *(b++);
1216 tail = bh;
1217 init_buffer(bh, end_buffer_io_async, NULL);
1218 bh->b_dev = dev;
1219 bh->b_blocknr = block;
1222 * When we use bmap, we define block zero to represent
1223 * a hole. ll_rw_page, however, may legitimately
1224 * access block zero, and we need to distinguish the
1225 * two cases.
1227 if (bmap && !block) {
1228 memset(bh->b_data, 0, size);
1229 set_bit(BH_Uptodate, &bh->b_state);
1230 continue;
1232 set_bit(BH_Mapped, &bh->b_state);
1234 tail->b_this_page = head;
1235 get_page(page);
1236 page->buffers = head;
1237 return 0;
1241 * We don't have to release all buffers here, but
1242 * we have to be sure that no dirty buffer is left
1243 * and no IO is going on (no buffer is locked), because
1244 * we have truncated the file and are going to free the
1245 * blocks on-disk..
1247 int block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
1249 struct buffer_head *head, *bh, *next;
1250 unsigned int curr_off = 0;
1252 if (!PageLocked(page))
1253 BUG();
1254 if (!page->buffers)
1255 return 0;
1257 head = page->buffers;
1258 bh = head;
1259 do {
1260 unsigned int next_off = curr_off + bh->b_size;
1261 next = bh->b_this_page;
1264 * is this block fully flushed?
1266 if (offset <= curr_off) {
1267 if (buffer_mapped(bh)) {
1268 atomic_inc(&bh->b_count);
1269 wait_on_buffer(bh);
1270 if (bh->b_dev == B_FREE)
1271 BUG();
1272 mark_buffer_clean(bh);
1273 clear_bit(BH_Uptodate, &bh->b_state);
1274 clear_bit(BH_Mapped, &bh->b_state);
1275 clear_bit(BH_Req, &bh->b_state);
1276 bh->b_blocknr = 0;
1277 atomic_dec(&bh->b_count);
1280 curr_off = next_off;
1281 bh = next;
1282 } while (bh != head);
1285 * subtle. We release buffer-heads only if this is
1286 * the 'final' flushpage. We have invalidated the bmap
1287 * cached value unconditionally, so real IO is not
1288 * possible anymore.
1290 * If the free doesn't work out, the buffers can be
1291 * left around - they just turn into anonymous buffers
1292 * instead.
1294 if (!offset) {
1295 if (!try_to_free_buffers(page))
1296 atomic_add(PAGE_CACHE_SIZE, &buffermem);
1299 return 0;
1302 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1304 struct buffer_head *bh, *head, *tail;
1306 head = create_buffers(page_address(page), blocksize, 1);
1307 if (page->buffers)
1308 BUG();
1310 bh = head;
1311 do {
1312 bh->b_dev = inode->i_dev;
1313 bh->b_blocknr = 0;
1314 bh->b_end_io = end_buffer_io_bad;
1315 tail = bh;
1316 bh = bh->b_this_page;
1317 } while (bh);
1318 tail->b_this_page = head;
1319 page->buffers = head;
1320 get_page(page);
1324 * block_write_full_page() is SMP-safe - currently it's still
1325 * being called with the kernel lock held, but the code is ready.
1327 int block_write_full_page(struct file *file, struct page *page)
1329 struct dentry *dentry = file->f_dentry;
1330 struct inode *inode = dentry->d_inode;
1331 int err, i;
1332 unsigned long block, offset;
1333 struct buffer_head *bh, *head;
1335 if (!PageLocked(page))
1336 BUG();
1338 if (!page->buffers)
1339 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1340 head = page->buffers;
1342 offset = page->offset;
1343 block = offset >> inode->i_sb->s_blocksize_bits;
1345 // FIXME: currently we assume page alignment.
1346 if (offset & (PAGE_SIZE-1))
1347 BUG();
1349 bh = head;
1350 i = 0;
1351 do {
1352 if (!bh)
1353 BUG();
1356 * If the buffer isn't up-to-date, we can't be sure
1357 * that the buffer has been initialized with the proper
1358 * block number information etc..
1360 * Leave it to the low-level FS to make all those
1361 * decisions (block #0 may actually be a valid block)
1363 bh->b_end_io = end_buffer_io_sync;
1364 if (!buffer_mapped(bh)) {
1365 err = inode->i_op->get_block(inode, block, bh, 1);
1366 if (err)
1367 goto out;
1369 set_bit(BH_Uptodate, &bh->b_state);
1370 mark_buffer_dirty(bh,0);
1372 bh = bh->b_this_page;
1373 block++;
1374 } while (bh != head);
1376 SetPageUptodate(page);
1377 return 0;
1378 out:
1379 ClearPageUptodate(page);
1380 return err;
1383 int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
1385 struct dentry *dentry = file->f_dentry;
1386 struct inode *inode = dentry->d_inode;
1387 unsigned long block;
1388 int err, partial;
1389 unsigned long blocksize, start_block, end_block;
1390 unsigned long start_offset, start_bytes, end_bytes;
1391 unsigned long bbits, blocks, i, len;
1392 struct buffer_head *bh, *head;
1393 char * target_buf;
1395 target_buf = (char *)page_address(page) + offset;
1397 if (!PageLocked(page))
1398 BUG();
1400 blocksize = inode->i_sb->s_blocksize;
1401 if (!page->buffers)
1402 create_empty_buffers(page, inode, blocksize);
1403 head = page->buffers;
1405 bbits = inode->i_sb->s_blocksize_bits;
1406 block = page->offset >> bbits;
1407 blocks = PAGE_SIZE >> bbits;
1408 start_block = offset >> bbits;
1409 end_block = (offset + bytes - 1) >> bbits;
1410 start_offset = offset & (blocksize - 1);
1411 start_bytes = blocksize - start_offset;
1412 if (start_bytes > bytes)
1413 start_bytes = bytes;
1414 end_bytes = (offset+bytes) & (blocksize - 1);
1415 if (end_bytes > bytes)
1416 end_bytes = bytes;
1418 if (offset < 0 || offset >= PAGE_SIZE)
1419 BUG();
1420 if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
1421 BUG();
1422 if (start_block < 0 || start_block >= blocks)
1423 BUG();
1424 if (end_block < 0 || end_block >= blocks)
1425 BUG();
1426 // FIXME: currently we assume page alignment.
1427 if (page->offset & (PAGE_SIZE-1))
1428 BUG();
1430 i = 0;
1431 bh = head;
1432 partial = 0;
1433 do {
1434 if (!bh)
1435 BUG();
1437 if ((i < start_block) || (i > end_block)) {
1438 if (!buffer_uptodate(bh))
1439 partial = 1;
1440 goto skip;
1444 * If the buffer is not up-to-date, we need to ask the low-level
1445 * FS to do something for us (we used to have assumptions about
1446 * the meaning of b_blocknr etc, that's bad).
1448 * If "update" is set, that means that the low-level FS should
1449 * try to make sure that the block is up-to-date because we're
1450 * not going to fill it completely.
1452 bh->b_end_io = end_buffer_io_sync;
1453 if (!buffer_mapped(bh)) {
1454 err = inode->i_op->get_block(inode, block, bh, 1);
1455 if (err)
1456 goto out;
1459 if (!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) {
1460 if (buffer_new(bh)) {
1461 memset(bh->b_data, 0, bh->b_size);
1462 } else {
1463 ll_rw_block(READ, 1, &bh);
1464 wait_on_buffer(bh);
1465 err = -EIO;
1466 if (!buffer_uptodate(bh))
1467 goto out;
1471 len = blocksize;
1472 if (start_offset) {
1473 len = start_bytes;
1474 start_offset = 0;
1475 } else if (end_bytes && (i == end_block)) {
1476 len = end_bytes;
1477 end_bytes = 0;
1479 err = copy_from_user(target_buf, buf, len);
1480 target_buf += len;
1481 buf += len;
1484 * we dirty buffers only after copying the data into
1485 * the page - this way we can dirty the buffer even if
1486 * the bh is still doing IO.
1488 * NOTE! This also does a direct dirty balance check,
1489 * rather than relying on bdflush just waking up every
1490 * once in a while. This is to catch (and slow down)
1491 * the processes that write tons of buffer..
1493 * Note how we do NOT want to do this in the full block
1494 * case: full pages are flushed not by the people who
1495 * dirtied them, but by people who need memory. And we
1496 * should not penalize them for somebody else writing
1497 * lots of dirty pages.
1499 set_bit(BH_Uptodate, &bh->b_state);
1500 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1501 __mark_dirty(bh, 0);
1502 if (too_many_dirty_buffers)
1503 balance_dirty(bh->b_dev);
1506 if (err) {
1507 err = -EFAULT;
1508 goto out;
1511 skip:
1512 i++;
1513 block++;
1514 bh = bh->b_this_page;
1515 } while (bh != head);
1518 * If this is a partial write that happened to make all buffers
1519 * uptodate then we can optimize away a bogus readpage() for
1520 * the next read(). Here we 'discover' whether the page went
1521 * uptodate as a result of this (potentially partial) write.
1523 if (!partial)
1524 SetPageUptodate(page);
1525 return bytes;
1526 out:
1527 ClearPageUptodate(page);
1528 return err;
1533 * IO completion routine for a buffer_head being used for kiobuf IO: we
1534 * can't dispatch the kiobuf callback until io_count reaches 0.
1537 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1539 struct kiobuf *kiobuf;
1541 mark_buffer_uptodate(bh, uptodate);
1543 kiobuf = bh->b_kiobuf;
1544 if (atomic_dec_and_test(&kiobuf->io_count))
1545 kiobuf->end_io(kiobuf);
1546 if (!uptodate)
1547 kiobuf->errno = -EIO;
1552 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1553 * for them to complete. Clean up the buffer_heads afterwards.
1556 #define dprintk(x...)
1558 static int do_kio(struct kiobuf *kiobuf,
1559 int rw, int nr, struct buffer_head *bh[], int size)
1561 int iosize;
1562 int i;
1563 struct buffer_head *tmp;
1565 struct task_struct *tsk = current;
1566 DECLARE_WAITQUEUE(wait, tsk);
1568 dprintk ("do_kio start %d\n", rw);
1570 if (rw == WRITE)
1571 rw = WRITERAW;
1572 atomic_add(nr, &kiobuf->io_count);
1573 kiobuf->errno = 0;
1574 ll_rw_block(rw, nr, bh);
1576 kiobuf_wait_for_io(kiobuf);
1578 spin_lock(&unused_list_lock);
1580 iosize = 0;
1581 for (i = nr; --i >= 0; ) {
1582 iosize += size;
1583 tmp = bh[i];
1584 if (!buffer_uptodate(tmp)) {
1585 /* We are traversing bh'es in reverse order so
1586 clearing iosize on error calculates the
1587 amount of IO before the first error. */
1588 iosize = 0;
1590 __put_unused_buffer_head(tmp);
1593 spin_unlock(&unused_list_lock);
1595 dprintk ("do_kio end %d %d\n", iosize, err);
1597 if (iosize)
1598 return iosize;
1599 if (kiobuf->errno)
1600 return kiobuf->errno;
1601 return -EIO;
1605 * Start I/O on a physical range of kernel memory, defined by a vector
1606 * of kiobuf structs (much like a user-space iovec list).
1608 * The kiobuf must already be locked for IO. IO is submitted
1609 * asynchronously: you need to check page->locked, page->uptodate, and
1610 * maybe wait on page->wait.
1612 * It is up to the caller to make sure that there are enough blocks
1613 * passed in to completely map the iobufs to disk.
1616 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1617 kdev_t dev, unsigned long b[], int size, int bmap)
1619 int err;
1620 int length;
1621 int transferred;
1622 int i;
1623 int bufind;
1624 int pageind;
1625 int bhind;
1626 int offset;
1627 unsigned long blocknr;
1628 struct kiobuf * iobuf = NULL;
1629 unsigned long page;
1630 struct page * map;
1631 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1633 if (!nr)
1634 return 0;
1637 * First, do some alignment and validity checks
1639 for (i = 0; i < nr; i++) {
1640 iobuf = iovec[i];
1641 if ((iobuf->offset & (size-1)) ||
1642 (iobuf->length & (size-1)))
1643 return -EINVAL;
1644 if (!iobuf->locked)
1645 panic("brw_kiovec: iobuf not locked for I/O");
1646 if (!iobuf->nr_pages)
1647 panic("brw_kiovec: iobuf not initialised");
1650 /* DEBUG */
1651 #if 0
1652 return iobuf->length;
1653 #endif
1654 dprintk ("brw_kiovec: start\n");
1657 * OK to walk down the iovec doing page IO on each page we find.
1659 bufind = bhind = transferred = err = 0;
1660 for (i = 0; i < nr; i++) {
1661 iobuf = iovec[i];
1662 offset = iobuf->offset;
1663 length = iobuf->length;
1664 dprintk ("iobuf %d %d %d\n", offset, length, size);
1666 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1667 page = iobuf->pagelist[pageind];
1668 map = iobuf->maplist[pageind];
1670 while (length > 0) {
1671 blocknr = b[bufind++];
1672 tmp = get_unused_buffer_head(0);
1673 if (!tmp) {
1674 err = -ENOMEM;
1675 goto error;
1678 tmp->b_dev = B_FREE;
1679 tmp->b_size = size;
1680 tmp->b_data = (char *) (page + offset);
1681 tmp->b_this_page = tmp;
1683 init_buffer(tmp, end_buffer_io_kiobuf, NULL);
1684 tmp->b_dev = dev;
1685 tmp->b_blocknr = blocknr;
1686 tmp->b_state = 1 << BH_Mapped;
1687 tmp->b_kiobuf = iobuf;
1689 if (rw == WRITE) {
1690 set_bit(BH_Uptodate, &tmp->b_state);
1691 set_bit(BH_Dirty, &tmp->b_state);
1694 dprintk ("buffer %d (%d) at %p\n",
1695 bhind, tmp->b_blocknr, tmp->b_data);
1696 bh[bhind++] = tmp;
1697 length -= size;
1698 offset += size;
1701 * Start the IO if we have got too much
1703 if (bhind >= KIO_MAX_SECTORS) {
1704 err = do_kio(iobuf, rw, bhind, bh, size);
1705 if (err >= 0)
1706 transferred += err;
1707 else
1708 goto finished;
1709 bhind = 0;
1712 if (offset >= PAGE_SIZE) {
1713 offset = 0;
1714 break;
1716 } /* End of block loop */
1717 } /* End of page loop */
1718 } /* End of iovec loop */
1720 /* Is there any IO still left to submit? */
1721 if (bhind) {
1722 err = do_kio(iobuf, rw, bhind, bh, size);
1723 if (err >= 0)
1724 transferred += err;
1725 else
1726 goto finished;
1729 finished:
1730 dprintk ("brw_kiovec: end (%d, %d)\n", transferred, err);
1731 if (transferred)
1732 return transferred;
1733 return err;
1735 error:
1736 /* We got an error allocating the bh'es. Just free the current
1737 buffer_heads and exit. */
1738 spin_lock(&unused_list_lock);
1739 for (i = bhind; --i >= 0; ) {
1740 __put_unused_buffer_head(bh[i]);
1742 spin_unlock(&unused_list_lock);
1743 goto finished;
1747 * Start I/O on a page.
1748 * This function expects the page to be locked and may return
1749 * before I/O is complete. You then have to check page->locked,
1750 * page->uptodate, and maybe wait on page->wait.
1752 * brw_page() is SMP-safe, although it's being called with the
1753 * kernel lock held - but the code is ready.
1755 * FIXME: we need a swapper_inode->get_block function to remove
1756 * some of the bmap kludges and interface ugliness here.
1758 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1760 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1761 int nr, fresh /* temporary debugging flag */, block;
1763 if (!PageLocked(page))
1764 panic("brw_page: page not locked for I/O");
1765 // clear_bit(PG_error, &page->flags);
1767 * We pretty much rely on the page lock for this, because
1768 * create_page_buffers() might sleep.
1770 fresh = 0;
1771 if (!page->buffers) {
1772 create_page_buffers(rw, page, dev, b, size, bmap);
1773 fresh = 1;
1775 if (!page->buffers)
1776 BUG();
1777 page->owner = (void *)-1;
1779 head = page->buffers;
1780 bh = head;
1781 nr = 0;
1782 do {
1783 block = *(b++);
1785 if (fresh && (atomic_read(&bh->b_count) != 0))
1786 BUG();
1787 if (rw == READ) {
1788 if (!fresh)
1789 BUG();
1790 if (bmap && !block) {
1791 if (block)
1792 BUG();
1793 } else {
1794 if (bmap && !block)
1795 BUG();
1796 if (!buffer_uptodate(bh)) {
1797 arr[nr++] = bh;
1798 atomic_inc(&bh->b_count);
1801 } else { /* WRITE */
1802 if (!bh->b_blocknr) {
1803 if (!block)
1804 BUG();
1805 bh->b_blocknr = block;
1806 } else {
1807 if (!block)
1808 BUG();
1810 set_bit(BH_Uptodate, &bh->b_state);
1811 set_bit(BH_Dirty, &bh->b_state);
1812 arr[nr++] = bh;
1813 atomic_inc(&bh->b_count);
1815 bh = bh->b_this_page;
1816 } while (bh != head);
1817 if (rw == READ)
1818 ++current->maj_flt;
1819 if ((rw == READ) && nr) {
1820 if (Page_Uptodate(page))
1821 BUG();
1822 ll_rw_block(rw, nr, arr);
1823 } else {
1824 if (!nr && rw == READ) {
1825 SetPageUptodate(page);
1826 page->owner = current;
1827 UnlockPage(page);
1829 if (nr && (rw == WRITE))
1830 ll_rw_block(rw, nr, arr);
1832 return 0;
1836 * Generic "read page" function for block devices that have the normal
1837 * bmap functionality. This is most of the block device filesystems.
1838 * Reads the page asynchronously --- the unlock_buffer() and
1839 * mark_buffer_uptodate() functions propagate buffer state into the
1840 * page struct once IO has completed.
1842 int block_read_full_page(struct file * file, struct page * page)
1844 struct dentry *dentry = file->f_dentry;
1845 struct inode *inode = dentry->d_inode;
1846 unsigned long iblock;
1847 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1848 unsigned int blocksize, blocks;
1849 int nr;
1851 if (!PageLocked(page))
1852 PAGE_BUG(page);
1853 blocksize = inode->i_sb->s_blocksize;
1854 if (!page->buffers)
1855 create_empty_buffers(page, inode, blocksize);
1856 head = page->buffers;
1858 blocks = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1859 iblock = page->offset >> inode->i_sb->s_blocksize_bits;
1860 page->owner = (void *)-1;
1861 head = page->buffers;
1862 bh = head;
1863 nr = 0;
1865 do {
1866 if (buffer_uptodate(bh))
1867 continue;
1869 if (!buffer_mapped(bh)) {
1870 inode->i_op->get_block(inode, iblock, bh, 0);
1871 if (!buffer_mapped(bh)) {
1872 memset(bh->b_data, 0, blocksize);
1873 set_bit(BH_Uptodate, &bh->b_state);
1874 continue;
1878 init_buffer(bh, end_buffer_io_async, NULL);
1879 atomic_inc(&bh->b_count);
1880 arr[nr] = bh;
1881 nr++;
1882 } while (iblock++, (bh = bh->b_this_page) != head);
1884 ++current->maj_flt;
1885 if (nr) {
1886 if (Page_Uptodate(page))
1887 BUG();
1888 ll_rw_block(READ, nr, arr);
1889 } else {
1891 * all buffers are uptodate - we can set the page
1892 * uptodate as well.
1894 SetPageUptodate(page);
1895 page->owner = current;
1896 UnlockPage(page);
1898 return 0;
1902 * Try to increase the number of buffers available: the size argument
1903 * is used to determine what kind of buffers we want.
1905 static int grow_buffers(int size)
1907 unsigned long page;
1908 struct buffer_head *bh, *tmp;
1909 struct buffer_head * insert_point;
1910 int isize;
1912 if ((size & 511) || (size > PAGE_SIZE)) {
1913 printk("VFS: grow_buffers: size = %d\n",size);
1914 return 0;
1917 if (!(page = __get_free_page(GFP_BUFFER)))
1918 return 0;
1919 bh = create_buffers(page, size, 0);
1920 if (!bh) {
1921 free_page(page);
1922 return 0;
1925 isize = BUFSIZE_INDEX(size);
1927 spin_lock(&free_list[isize].lock);
1928 insert_point = free_list[isize].list;
1929 tmp = bh;
1930 while (1) {
1931 if (insert_point) {
1932 tmp->b_next_free = insert_point->b_next_free;
1933 tmp->b_prev_free = insert_point;
1934 insert_point->b_next_free->b_prev_free = tmp;
1935 insert_point->b_next_free = tmp;
1936 } else {
1937 tmp->b_prev_free = tmp;
1938 tmp->b_next_free = tmp;
1940 insert_point = tmp;
1941 if (tmp->b_this_page)
1942 tmp = tmp->b_this_page;
1943 else
1944 break;
1946 tmp->b_this_page = bh;
1947 free_list[isize].list = bh;
1948 spin_unlock(&free_list[isize].lock);
1950 mem_map[MAP_NR(page)].buffers = bh;
1951 atomic_add(PAGE_SIZE, &buffermem);
1952 return 1;
1956 * Can the buffer be thrown out?
1958 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1959 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
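A quick illustration (userspace; the BH_* bit numbers are assumptions for the example) of how buffer_busy() combines the reference count and the Dirty/Lock/Protected state bits into a single nonzero-means-busy test:

#include <stdio.h>

enum { BH_Dirty = 1, BH_Lock = 2, BH_Protected = 6 };   /* assumed bit numbers */
#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))

struct fake_bh { int b_count; unsigned long b_state; };
#define buffer_busy(bh) ((bh)->b_count | ((bh)->b_state & BUFFER_BUSY_BITS))

int main(void)
{
        struct fake_bh clean = { 0, 0 };
        struct fake_bh dirty = { 0, 1UL << BH_Dirty };
        struct fake_bh held  = { 1, 0 };

        printf("clean buffer busy? %d\n", buffer_busy(&clean) != 0);
        printf("dirty buffer busy? %d\n", buffer_busy(&dirty) != 0);
        printf("referenced buffer busy? %d\n", buffer_busy(&held) != 0);
        return 0;
}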
1962 * try_to_free_buffers() checks if all the buffers on this particular page
1963 * are unused, and frees the page if so.
1965 * Wake up bdflush() if this fails - if we're running low on memory due
1966 * to dirty buffers, we need to flush them out as quickly as possible.
1968 * NOTE: There are quite a number of ways that threads of control can
1969 * obtain a reference to a buffer head within a page. So we must
1970 * lock out all of these paths to cleanly toss the page.
1972 int try_to_free_buffers(struct page * page)
1974 struct buffer_head * tmp, * bh = page->buffers;
1975 int index = BUFSIZE_INDEX(bh->b_size);
1976 int ret;
1978 spin_lock(&lru_list_lock);
1979 write_lock(&hash_table_lock);
1980 spin_lock(&free_list[index].lock);
1981 tmp = bh;
1982 do {
1983 struct buffer_head * p = tmp;
1985 tmp = tmp->b_this_page;
1986 if (buffer_busy(p))
1987 goto busy_buffer_page;
1988 } while (tmp != bh);
1990 spin_lock(&unused_list_lock);
1991 tmp = bh;
1992 do {
1993 struct buffer_head * p = tmp;
1994 tmp = tmp->b_this_page;
1996 /* The buffer can be either on the regular
1997 * queues or on the free list..
1999 if (p->b_dev == B_FREE) {
2000 __remove_from_free_list(p, index);
2001 } else {
2002 if (p->b_pprev)
2003 __hash_unlink(p);
2004 __remove_from_lru_list(p, p->b_list);
2006 __put_unused_buffer_head(p);
2007 } while (tmp != bh);
2008 spin_unlock(&unused_list_lock);
2010 /* Wake up anyone waiting for buffer heads */
2011 wake_up(&buffer_wait);
2013 /* And free the page */
2014 page->buffers = NULL;
2015 __free_page(page);
2016 ret = 1;
2017 out:
2018 spin_unlock(&free_list[index].lock);
2019 write_unlock(&hash_table_lock);
2020 spin_unlock(&lru_list_lock);
2021 return ret;
2023 busy_buffer_page:
2024 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2025 too_many_dirty_buffers = 1;
2026 wakeup_bdflush(0);
2027 ret = 0;
2028 goto out;
2031 /* ===================== Init ======================= */
2034 * allocate the hash table and init the free list.
2035 * Use gfp() for the hash table to decrease TLB misses, use
2036 * SLAB cache for buffer heads.
2038 void __init buffer_init(unsigned long memory_size)
2040 int order, i;
2041 unsigned int nr_hash;
2043 /* The buffer cache hash table is less important these days,
2044 * trim it a bit.
2046 memory_size >>= 14;
2047 memory_size *= sizeof(struct buffer_head *);
2048 for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
2051 /* try to allocate something until we get it or we're asking
2052 for something that is really too small */
2054 do {
2055 unsigned long tmp;
2057 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2058 bh_hash_mask = (nr_hash - 1);
2060 tmp = nr_hash;
2061 bh_hash_shift = 0;
2062 while((tmp >>= 1UL) != 0UL)
2063 bh_hash_shift++;
2065 hash_table = (struct buffer_head **)
2066 __get_free_pages(GFP_ATOMIC, order);
2067 } while (hash_table == NULL && --order > 0);
2068 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2069 nr_hash, order, (1UL<<order) * PAGE_SIZE);
2071 if (!hash_table)
2072 panic("Failed to allocate buffer hash table\n");
2074 /* Setup hash chains. */
2075 for(i = 0; i < nr_hash; i++)
2076 hash_table[i] = NULL;
2078 /* Setup free lists. */
2079 for(i = 0; i < NR_SIZES; i++) {
2080 free_list[i].list = NULL;
2081 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2084 /* Setup lru lists. */
2085 for(i = 0; i < NR_LIST; i++)
2086 lru_list[i] = NULL;
2088 bh_cachep = kmem_cache_create("buffer_head",
2089 sizeof(struct buffer_head),
2091 SLAB_HWCACHE_ALIGN, NULL, NULL);
2092 if(!bh_cachep)
2093 panic("Cannot create buffer head SLAB cache\n");
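A hedged userspace sketch of the hash sizing arithmetic in buffer_init() above (4K pages, 64-bit pointers and 64 MB of memory are assumptions for the example): the table is sized at one pointer per 16K of memory, rounded up to a whole power-of-two page order, and bh_hash_shift/bh_hash_mask fall out of the resulting bucket count.

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long memory_size = 64UL << 20;         /* assume 64 MB of RAM */
        unsigned long bytes, nr_hash, tmp;
        unsigned int order = 0, shift = 0;

        bytes = (memory_size >> 14) * sizeof(void *);   /* one pointer per 16K */
        while ((PAGE_SIZE << order) < bytes)
                order++;

        nr_hash = (PAGE_SIZE << order) / sizeof(void *);
        for (tmp = nr_hash; tmp >>= 1; )
                shift++;

        printf("order %u, %lu buckets, shift %u, mask 0x%lx\n",
               order, nr_hash, shift, nr_hash - 1);
        return 0;
}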
2097 /* ====================== bdflush support =================== */
2099 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2100 * response to dirty buffers. Once this process is activated, we write back
2101 * a limited number of buffers to the disks and then go back to sleep again.
2103 static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2104 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2105 struct task_struct *bdflush_tsk = 0;
2107 void wakeup_bdflush(int wait)
2109 if (current == bdflush_tsk)
2110 return;
2111 if (wait)
2112 run_task_queue(&tq_disk);
2113 wake_up(&bdflush_wait);
2114 if (wait)
2115 sleep_on(&bdflush_done);
2120 * Here we attempt to write back old buffers. We also try to flush inodes
2121 * and supers as well, since this function is essentially "update", and
2122 * otherwise there would be no way of ensuring that these quantities ever
2123 * get written back. Ideally, we would have a timestamp on the inodes
2124 * and superblocks so that we could write back only the old ones as well
2127 static int sync_old_buffers(void)
2129 int nlist;
2131 lock_kernel();
2132 sync_supers(0);
2133 sync_inodes(0);
2134 unlock_kernel();
2136 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2137 struct buffer_head *bh;
2138 repeat:
2139 spin_lock(&lru_list_lock);
2140 bh = lru_list[nlist];
2141 if(bh) {
2142 struct buffer_head *next;
2143 int i;
2144 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
2145 next = bh->b_next_free;
2147 /* If the buffer is not on the proper list,
2148 * then refile it.
2150 if ((nlist == BUF_DIRTY &&
2151 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2152 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2153 __refile_buffer(bh);
2154 continue;
2157 if (buffer_locked(bh) || !buffer_dirty(bh))
2158 continue;
2160 /* OK, now we are committed to write it out. */
2161 bh->b_flushtime = 0;
2162 atomic_inc(&bh->b_count);
2163 spin_unlock(&lru_list_lock);
2164 ll_rw_block(WRITE, 1, &bh);
2165 atomic_dec(&bh->b_count);
2166 goto repeat;
2169 spin_unlock(&lru_list_lock);
2171 run_task_queue(&tq_disk);
2172 return 0;
2175 /* This is the interface to bdflush. As we get more sophisticated, we can
2176 * pass tuning parameters to this "process", to adjust how it behaves.
2177 * We would want to verify each parameter, however, to make sure that it
2178 * is reasonable. */
2180 asmlinkage int sys_bdflush(int func, long data)
2182 if (!capable(CAP_SYS_ADMIN))
2183 return -EPERM;
2185 if (func == 1) {
2186 int error;
2187 struct mm_struct *user_mm;
2190 * bdflush will spend all of its time in kernel-space,
2191 * without touching user-space, so we can switch it into
2192 * 'lazy TLB mode' to reduce the cost of context-switches
2193 * to and from bdflush.
2195 user_mm = start_lazy_tlb();
2196 error = sync_old_buffers();
2197 end_lazy_tlb(user_mm);
2198 return error;
2201 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2202 if (func >= 2) {
2203 int i = (func-2) >> 1;
2204 if (i >= 0 && i < N_PARAM) {
2205 if ((func & 1) == 0)
2206 return put_user(bdf_prm.data[i], (int*)data);
2208 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2209 bdf_prm.data[i] = data;
2210 return 0;
2213 return -EINVAL;
2216 /* Func 0 used to launch the actual bdflush and then never
2217 * return (unless explicitly killed). We return zero here to
2218 * remain semi-compatible with present update(8) programs.
2220 return 0;
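A small sketch (userspace) of the func encoding handled above: func 1 runs sync_old_buffers(), func 0 is the compatibility no-op, and for func >= 2 the parameter index is (func-2)>>1, with even values reading and odd values writing (subject to the min/max clamp).

#include <stdio.h>

static void decode_bdflush_func(int func)
{
        if (func < 2) {
                printf("func %d: %s\n", func,
                       func ? "flush old buffers" : "compatibility no-op");
                return;
        }
        printf("func %d: %s parameter %d\n",
               func, (func & 1) ? "write" : "read", (func - 2) >> 1);
}

int main(void)
{
        int func;

        for (func = 0; func < 6; func++)
                decode_bdflush_func(func);
        return 0;
}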
2224 * This is the actual bdflush daemon itself. It used to be started from
2225 * the syscall above, but now we launch it ourselves internally with
2226 * kernel_thread(...) directly after the first thread in init/main.c
2228 int bdflush(void * unused)
2231 * We have a bare-bones task_struct, and really should fill
2232 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2233 * display semi-sane things. Not real crucial though...
2236 current->session = 1;
2237 current->pgrp = 1;
2238 sprintf(current->comm, "kflushd");
2239 bdflush_tsk = current;
2241 for (;;) {
2242 int nlist;
2244 CHECK_EMERGENCY_SYNC
2246 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2247 int nr, major, written = 0;
2248 struct buffer_head *next;
2250 repeat:
2251 spin_lock(&lru_list_lock);
2252 next = lru_list[nlist];
2253 nr = nr_buffers_type[nlist];
2254 while (nr-- > 0) {
2255 struct buffer_head *bh = next;
2257 next = next->b_next_free;
2259 /* If the buffer is not on the correct list,
2260 * then refile it.
2262 if ((nlist == BUF_DIRTY &&
2263 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2264 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2265 __refile_buffer(bh);
2266 continue;
2269 /* If we aren't in panic mode, don't write out too much
2270 * at a time. Also, don't write out buffers we don't
2271 * really have to write out yet..
2273 if (!too_many_dirty_buffers) {
2274 if (written > bdf_prm.b_un.ndirty)
2275 break;
2276 if (time_before(jiffies, bh->b_flushtime))
2277 continue;
2280 if (buffer_locked(bh) || !buffer_dirty(bh))
2281 continue;
2283 major = MAJOR(bh->b_dev);
2284 written++;
2285 bh->b_flushtime = 0;
2288 * For the loop major we can try to do asynchronous writes,
2289 * but we have to guarantee that we're making some progress..
2291 atomic_inc(&bh->b_count);
2292 spin_unlock(&lru_list_lock);
2293 if (major == LOOP_MAJOR && written > 1) {
2294 ll_rw_block(WRITEA, 1, &bh);
2295 if (buffer_dirty(bh))
2296 --written;
2297 } else
2298 ll_rw_block(WRITE, 1, &bh);
2299 atomic_dec(&bh->b_count);
2300 goto repeat;
2302 spin_unlock(&lru_list_lock);
2304 run_task_queue(&tq_disk);
2305 wake_up(&bdflush_done);
2308 * If there are still a lot of dirty buffers around,
2309 * skip the sleep and flush some more. Otherwise, we
2310 * sleep for a while and mark us as not being in panic
2311 * mode..
2313 if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
2314 too_many_dirty_buffers = 0;
2315 spin_lock_irq(&current->sigmask_lock);
2316 flush_signals(current);
2317 spin_unlock_irq(&current->sigmask_lock);
2318 interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);