Import 2.3.12pre1
[davej-history.git] / fs / buffer.c
blob f578992b3b2f0f37a3fb862c13fcfe0b17c23163
1 /*
2  * linux/fs/buffer.c
3  *
4  * Copyright (C) 1991, 1992 Linus Torvalds
5  */
7 /*
8  * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9  * been avoided by NEVER letting an interrupt change a buffer (except for the
10  * data, of course), but instead letting the caller do it.
11  */
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17  */
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20  * hash table, use SLAB cache for buffer heads. -DaveM
21  */
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24  * - RMK
25  */
27 /* Thread it... -DaveM */
29 #include <linux/sched.h>
30 #include <linux/fs.h>
31 #include <linux/malloc.h>
32 #include <linux/locks.h>
33 #include <linux/errno.h>
34 #include <linux/swap.h>
35 #include <linux/swapctl.h>
36 #include <linux/smp_lock.h>
37 #include <linux/vmalloc.h>
38 #include <linux/blkdev.h>
39 #include <linux/sysrq.h>
40 #include <linux/file.h>
41 #include <linux/init.h>
42 #include <linux/quotaops.h>
44 #include <asm/uaccess.h>
45 #include <asm/io.h>
46 #include <asm/bitops.h>
47 #include <asm/mmu_context.h>
49 #define NR_SIZES 7
50 static char buffersize_index[65] =
51 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
52 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
53 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
54 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
55 6};
57 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
58 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
59 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
60 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
61 number of unused buffer heads */
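The buffersize_index[] table above maps (size >> 9) to a slot in free_list[], so only the power-of-two sizes from 512 bytes up to 32K resolve to a valid index; everything else yields -1. A minimal standalone sketch of that mapping (userspace C, not part of buffer.c):

#include <stdio.h>

/* Same table and macro as above, reproduced for illustration only. */
static const char buffersize_index[65] =
{-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};

#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])

int main(void)
{
	/* 512 -> 0, 1024 -> 1, 2048 -> 2, ..., 32768 -> 6 */
	for (int size = 512; size <= 32768; size <<= 1)
		printf("%5d bytes -> free_list[%d]\n", size, BUFSIZE_INDEX(size));
	return 0;
}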
63 /* Anti-deadlock ordering:
64 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
68 * Hash table gook..
70 static unsigned int bh_hash_mask = 0;
71 static unsigned int bh_hash_shift = 0;
72 static struct buffer_head **hash_table;
73 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
75 static struct buffer_head *lru_list[NR_LIST];
76 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
77 static int nr_buffers_type[NR_LIST] = {0,};
79 static struct buffer_head * unused_list = NULL;
80 static int nr_unused_buffer_heads = 0;
81 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
82 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
84 struct bh_free_head {
85 struct buffer_head *list;
86 spinlock_t lock;
88 static struct bh_free_head free_list[NR_SIZES];
90 static kmem_cache_t *bh_cachep;
92 static int grow_buffers(int size);
94 /* This is used by some architectures to estimate available memory. */
95 atomic_t buffermem = ATOMIC_INIT(0);
97 /* Here is the parameter block for the bdflush process. If you add or
98 * remove any of the parameters, make sure to update kernel/sysctl.c.
101 #define N_PARAM 9
103 /* The dummy values in this structure are left in there for compatibility
104 * with old programs that play with the /proc entries.
106 union bdflush_param {
107 struct {
108 int nfract; /* Percentage of buffer cache dirty to
109 activate bdflush */
110 int ndirty; /* Maximum number of dirty blocks to write out per
111 wake-cycle */
112 int nrefill; /* Number of clean buffers to try to obtain
113 each time we call refill */
114 int nref_dirt; /* Dirty buffer threshold for activating bdflush
115 when trying to refill buffers. */
116 int dummy1; /* unused */
117 int age_buffer; /* Time for normal buffer to age before we flush it */
118 int age_super; /* Time for superblock to age before we flush it */
119 int dummy2; /* unused */
120 int dummy3; /* unused */
121 } b_un;
122 unsigned int data[N_PARAM];
123 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
125 /* These are the min and max parameter values that we will allow to be assigned */
126 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
127 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
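These tunables are exposed through the bdflush(2) system call implemented by sys_bdflush() near the end of this file: func 2+2*i reads parameter i, func 3+2*i writes it, with writes range-checked against bdflush_min[]/bdflush_max[]. A hypothetical userspace sketch that dumps the current values; it assumes the syscall number is exported as SYS_bdflush and, per sys_bdflush(), needs CAP_SYS_ADMIN:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	for (int i = 0; i < 9; i++) {	/* N_PARAM == 9 */
		int value;
		/* func = 2 + 2*i means "read parameter i" in sys_bdflush() */
		if (syscall(SYS_bdflush, 2 + 2 * i, &value) == 0)
			printf("bdf_prm.data[%d] = %d\n", i, value);
	}
	return 0;
}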
129 void wakeup_bdflush(int);
132 * Rewrote the wait-routines to use the "new" wait-queue functionality,
133 * and getting rid of the cli-sti pairs. The wait-queue routines still
134 * need cli-sti, but now it's just a couple of 386 instructions or so.
136 * Note that the real wait_on_buffer() is an inline function that checks
137 * if 'b_wait' is set before calling this, so that the queues aren't set
138 * up unnecessarily.
140 void __wait_on_buffer(struct buffer_head * bh)
142 struct task_struct *tsk = current;
143 DECLARE_WAITQUEUE(wait, tsk);
145 atomic_inc(&bh->b_count);
146 add_wait_queue(&bh->b_wait, &wait);
147 repeat:
148 tsk->state = TASK_UNINTERRUPTIBLE;
149 run_task_queue(&tq_disk);
150 if (buffer_locked(bh)) {
151 schedule();
152 goto repeat;
154 tsk->state = TASK_RUNNING;
155 remove_wait_queue(&bh->b_wait, &wait);
156 atomic_dec(&bh->b_count);
159 /* Call sync_buffers with wait!=0 to ensure that the call does not
160 * return until all buffer writes have completed. Sync() may return
161 * before the writes have finished; fsync() may not.
164 /* Godamity-damn. Some buffers (bitmaps for filesystems)
165 * spontaneously dirty themselves without ever brelse being called.
166 * We will ultimately want to put these in a separate list, but for
167 * now we search all of the lists for dirty buffers.
169 static int sync_buffers(kdev_t dev, int wait)
171 int i, retry, pass = 0, err = 0;
172 struct buffer_head * bh, *next;
174 /* One pass for no-wait, three for wait:
175 * 0) write out all dirty, unlocked buffers;
176 * 1) write out all dirty buffers, waiting if locked;
177 * 2) wait for completion by waiting for all buffers to unlock.
179 do {
180 retry = 0;
182 /* We search all lists as a failsafe mechanism, not because we expect
183 * there to be dirty buffers on any of the other lists.
185 repeat:
186 spin_lock(&lru_list_lock);
187 bh = lru_list[BUF_DIRTY];
188 if (!bh)
189 goto repeat2;
191 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
192 next = bh->b_next_free;
194 if (!lru_list[BUF_DIRTY])
195 break;
196 if (dev && bh->b_dev != dev)
197 continue;
198 if (buffer_locked(bh)) {
199 /* Buffer is locked; skip it unless wait is
200 * requested AND pass > 0.
202 if (!wait || !pass) {
203 retry = 1;
204 continue;
206 atomic_inc(&bh->b_count);
207 spin_unlock(&lru_list_lock);
208 wait_on_buffer (bh);
209 atomic_dec(&bh->b_count);
210 goto repeat;
213 /* If an unlocked buffer is not uptodate, there has
214 * been an IO error. Skip it.
216 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
217 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
218 err = -EIO;
219 continue;
222 /* Don't write clean buffers. Don't write ANY buffers
223 * on the third pass.
225 if (!buffer_dirty(bh) || pass >= 2)
226 continue;
228 atomic_inc(&bh->b_count);
229 bh->b_flushtime = 0;
230 spin_unlock(&lru_list_lock);
231 ll_rw_block(WRITE, 1, &bh);
232 atomic_dec(&bh->b_count);
233 retry = 1;
234 goto repeat;
237 repeat2:
238 bh = lru_list[BUF_LOCKED];
239 if (!bh) {
240 spin_unlock(&lru_list_lock);
241 break;
243 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
244 next = bh->b_next_free;
246 if (!lru_list[BUF_LOCKED])
247 break;
248 if (dev && bh->b_dev != dev)
249 continue;
250 if (buffer_locked(bh)) {
251 /* Buffer is locked; skip it unless wait is
252 * requested AND pass > 0.
254 if (!wait || !pass) {
255 retry = 1;
256 continue;
258 atomic_inc(&bh->b_count);
259 spin_unlock(&lru_list_lock);
260 wait_on_buffer (bh);
261 spin_lock(&lru_list_lock);
262 atomic_dec(&bh->b_count);
263 goto repeat2;
266 spin_unlock(&lru_list_lock);
268 /* If we are waiting for the sync to succeed, and if any dirty
269 * blocks were written, then repeat; on the second pass, only
270 * wait for buffers being written (do not pass to write any
271 * more buffers on the second pass).
273 } while (wait && retry && ++pass<=2);
274 return err;
277 void sync_dev(kdev_t dev)
279 sync_buffers(dev, 0);
280 sync_supers(dev);
281 sync_inodes(dev);
282 sync_buffers(dev, 0);
283 DQUOT_SYNC(dev);
285 * FIXME(eric) we need to sync the physical devices here.
286 * This is because some (scsi) controllers have huge amounts of
287 * cache onboard (hundreds of Mb), and we need to instruct
288 * them to commit all of the dirty memory to disk, and we should
289 * not return until this has happened.
291 * This would need to get implemented by going through the assorted
292 * layers so that each block major number can be synced, and this
293 * would call down into the upper and mid-layer scsi.
297 int fsync_dev(kdev_t dev)
299 sync_buffers(dev, 0);
301 lock_kernel();
302 sync_supers(dev);
303 sync_inodes(dev);
304 DQUOT_SYNC(dev);
305 unlock_kernel();
307 return sync_buffers(dev, 1);
310 asmlinkage int sys_sync(void)
312 fsync_dev(0);
313 return 0;
317 * filp may be NULL if called via the msync of a vma.
320 int file_fsync(struct file *filp, struct dentry *dentry)
322 struct inode * inode = dentry->d_inode;
323 struct super_block * sb;
324 kdev_t dev;
326 /* sync the inode to buffers */
327 write_inode_now(inode);
329 /* sync the superblock to buffers */
330 sb = inode->i_sb;
331 wait_on_super(sb);
332 if (sb->s_op && sb->s_op->write_super)
333 sb->s_op->write_super(sb);
335 /* .. finally sync the buffers to disk */
336 dev = inode->i_dev;
337 return sync_buffers(dev, 1);
340 asmlinkage int sys_fsync(unsigned int fd)
342 struct file * file;
343 struct dentry * dentry;
344 struct inode * inode;
345 int err;
347 lock_kernel();
348 err = -EBADF;
349 file = fget(fd);
350 if (!file)
351 goto out;
353 dentry = file->f_dentry;
354 if (!dentry)
355 goto out_putf;
357 inode = dentry->d_inode;
358 if (!inode)
359 goto out_putf;
361 err = -EINVAL;
362 if (!file->f_op || !file->f_op->fsync)
363 goto out_putf;
365 /* We need to protect against concurrent writers.. */
366 down(&inode->i_sem);
367 err = file->f_op->fsync(file, dentry);
368 up(&inode->i_sem);
370 out_putf:
371 fput(file);
372 out:
373 unlock_kernel();
374 return err;
377 asmlinkage int sys_fdatasync(unsigned int fd)
379 struct file * file;
380 struct dentry * dentry;
381 struct inode * inode;
382 int err;
384 lock_kernel();
385 err = -EBADF;
386 file = fget(fd);
387 if (!file)
388 goto out;
390 dentry = file->f_dentry;
391 if (!dentry)
392 goto out_putf;
394 inode = dentry->d_inode;
395 if (!inode)
396 goto out_putf;
398 err = -EINVAL;
399 if (!file->f_op || !file->f_op->fsync)
400 goto out_putf;
402 /* this needs further work, at the moment it is identical to fsync() */
403 down(&inode->i_sem);
404 err = file->f_op->fsync(file, dentry);
405 up(&inode->i_sem);
407 out_putf:
408 fput(file);
409 out:
410 unlock_kernel();
411 return err;
414 void invalidate_buffers(kdev_t dev)
416 int nlist;
418 spin_lock(&lru_list_lock);
419 for(nlist = 0; nlist < NR_LIST; nlist++) {
420 struct buffer_head * bh;
421 int i;
422 retry:
423 bh = lru_list[nlist];
424 if (!bh)
425 continue;
426 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
427 if (bh->b_dev != dev)
428 continue;
429 if (buffer_locked(bh)) {
430 atomic_inc(&bh->b_count);
431 spin_unlock(&lru_list_lock);
432 wait_on_buffer(bh);
433 spin_lock(&lru_list_lock);
434 atomic_dec(&bh->b_count);
435 goto retry;
437 if (atomic_read(&bh->b_count))
438 continue;
439 bh->b_flushtime = 0;
440 clear_bit(BH_Protected, &bh->b_state);
441 clear_bit(BH_Uptodate, &bh->b_state);
442 clear_bit(BH_Dirty, &bh->b_state);
443 clear_bit(BH_Req, &bh->b_state);
446 spin_unlock(&lru_list_lock);
449 /* After several hours of tedious analysis, the following hash
450 * function won. Do not mess with it... -DaveM
452 #define _hashfn(dev,block) \
453 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
454 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
455 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
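A standalone sketch of how _hashfn() folds a (dev, block) pair into a bucket index. It assumes a 16384-entry table, so bh_hash_shift works out to 14, computed the same way buffer_init() does at the bottom of this file:

#include <stdio.h>

static unsigned int bh_hash_mask;
static unsigned int bh_hash_shift;

/* Same folding as the kernel macro above. */
#define _hashfn(dev,block) \
  ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
   (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))

int main(void)
{
	unsigned int nr_hash = 16384;		/* assumed table size (power of two) */
	bh_hash_mask = nr_hash - 1;
	for (unsigned int tmp = nr_hash; (tmp >>= 1) != 0; )
		bh_hash_shift++;		/* 14 for a 16384-entry table */

	unsigned int dev = 0x0803;		/* e.g. major 8, minor 3 */
	for (unsigned int block = 0; block < 4; block++)
		printf("dev %#x block %u -> bucket %u\n",
		       dev, block, _hashfn(dev, block) & bh_hash_mask);
	return 0;
}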
457 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
459 if ((bh->b_next = *head) != NULL)
460 bh->b_next->b_pprev = &bh->b_next;
461 *head = bh;
462 bh->b_pprev = head;
465 static __inline__ void __hash_unlink(struct buffer_head *bh)
467 if (bh->b_next)
468 bh->b_next->b_pprev = bh->b_pprev;
469 *(bh->b_pprev) = bh->b_next;
470 bh->b_pprev = NULL;
473 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
475 struct buffer_head **bhp = &lru_list[blist];
477 if(!*bhp) {
478 *bhp = bh;
479 bh->b_prev_free = bh;
481 bh->b_next_free = *bhp;
482 bh->b_prev_free = (*bhp)->b_prev_free;
483 (*bhp)->b_prev_free->b_next_free = bh;
484 (*bhp)->b_prev_free = bh;
485 nr_buffers_type[blist]++;
488 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
490 if (bh->b_prev_free || bh->b_next_free) {
491 bh->b_prev_free->b_next_free = bh->b_next_free;
492 bh->b_next_free->b_prev_free = bh->b_prev_free;
493 if (lru_list[blist] == bh)
494 lru_list[blist] = bh->b_next_free;
495 if (lru_list[blist] == bh)
496 lru_list[blist] = NULL;
497 bh->b_next_free = bh->b_prev_free = NULL;
498 nr_buffers_type[blist]--;
502 static void __remove_from_free_list(struct buffer_head * bh, int index)
504 if(bh->b_next_free == bh)
505 free_list[index].list = NULL;
506 else {
507 bh->b_prev_free->b_next_free = bh->b_next_free;
508 bh->b_next_free->b_prev_free = bh->b_prev_free;
509 if (free_list[index].list == bh)
510 free_list[index].list = bh->b_next_free;
512 bh->b_next_free = bh->b_prev_free = NULL;
515 /* The following two functions must operate atomically
516 * because they control the visibility of a buffer head
517 * to the rest of the kernel.
519 static __inline__ void __remove_from_queues(struct buffer_head *bh)
521 write_lock(&hash_table_lock);
522 if (bh->b_pprev)
523 __hash_unlink(bh);
524 __remove_from_lru_list(bh, bh->b_list);
525 write_unlock(&hash_table_lock);
528 static void insert_into_queues(struct buffer_head *bh)
530 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
532 spin_lock(&lru_list_lock);
533 write_lock(&hash_table_lock);
534 __hash_link(bh, head);
535 __insert_into_lru_list(bh, bh->b_list);
536 write_unlock(&hash_table_lock);
537 spin_unlock(&lru_list_lock);
540 /* This function must only run if there are no other
541 * references _anywhere_ to this buffer head.
543 static void put_last_free(struct buffer_head * bh)
545 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
546 struct buffer_head **bhp = &head->list;
548 spin_lock(&head->lock);
549 bh->b_dev = B_FREE;
550 if(!*bhp) {
551 *bhp = bh;
552 bh->b_prev_free = bh;
554 bh->b_next_free = *bhp;
555 bh->b_prev_free = (*bhp)->b_prev_free;
556 (*bhp)->b_prev_free->b_next_free = bh;
557 (*bhp)->b_prev_free = bh;
558 spin_unlock(&head->lock);
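Both the per-size free lists and the LRU lists are circular doubly linked rings threaded through b_next_free/b_prev_free, and put_last_free() / __insert_into_lru_list() splice new entries in just before the head, i.e. at the tail. A userspace sketch of that splice with illustrative names (not real buffer heads):

#include <stdio.h>

struct node {
	int blocknr;
	struct node *next_free, *prev_free;
};

/* Mirrors the kernel's tail insert: seed an empty ring so the common
 * splice path below also handles the single-element case. */
static void put_last(struct node **head, struct node *n)
{
	if (!*head) {
		*head = n;
		n->prev_free = n;
	}
	n->next_free = *head;
	n->prev_free = (*head)->prev_free;
	(*head)->prev_free->next_free = n;
	(*head)->prev_free = n;
}

int main(void)
{
	struct node nodes[3] = {{10}, {20}, {30}};
	struct node *head = NULL, *n;

	for (int i = 0; i < 3; i++)
		put_last(&head, &nodes[i]);

	n = head;				/* walk the ring once from the head */
	do {
		printf("%d ", n->blocknr);
		n = n->next_free;
	} while (n != head);
	printf("\n");				/* prints: 10 20 30 */
	return 0;
}

Keeping the ring circular is what lets __remove_from_lru_list() unlink an arbitrary element by relinking its two neighbours, with no NULL checks on them.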
562 * Why like this, I hear you say... The reason is race-conditions.
563 * As we don't lock buffers (unless we are reading them, that is),
564 * something might happen to it while we sleep (ie a read-error
565 * will force it bad). This shouldn't really happen currently, but
566 * the code is ready.
568 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
570 struct buffer_head **head = &hash(dev, block);
571 struct buffer_head *bh;
573 read_lock(&hash_table_lock);
574 for(bh = *head; bh; bh = bh->b_next)
575 if (bh->b_blocknr == block &&
576 bh->b_size == size &&
577 bh->b_dev == dev)
578 break;
579 if (bh)
580 atomic_inc(&bh->b_count);
581 read_unlock(&hash_table_lock);
583 return bh;
586 unsigned int get_hardblocksize(kdev_t dev)
589 * Get the hard sector size for the given device. If we don't know
590 * what it is, return 0.
592 if (hardsect_size[MAJOR(dev)] != NULL) {
593 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
594 if (blksize != 0)
595 return blksize;
599 * We don't know what the hardware sector size for this device is.
600 * Return 0 indicating that we don't know.
602 return 0;
605 void set_blocksize(kdev_t dev, int size)
607 extern int *blksize_size[];
608 int i, nlist;
609 struct buffer_head * bh, *bhnext;
611 if (!blksize_size[MAJOR(dev)])
612 return;
614 /* Size must be a power of two, and between 512 and PAGE_SIZE */
615 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
616 panic("Invalid blocksize passed to set_blocksize");
618 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
619 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
620 return;
622 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
623 return;
624 sync_buffers(dev, 2);
625 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
627 /* We need to be quite careful how we do this - we are moving entries
628 * around on the free list, and we can get in a loop if we are not careful.
630 for(nlist = 0; nlist < NR_LIST; nlist++) {
631 repeat:
632 spin_lock(&lru_list_lock);
633 bh = lru_list[nlist];
634 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
635 if(!bh)
636 break;
638 bhnext = bh->b_next_free;
639 if (bh->b_dev != dev)
640 continue;
641 if (bh->b_size == size)
642 continue;
643 if (buffer_locked(bh)) {
644 atomic_inc(&bh->b_count);
645 spin_unlock(&lru_list_lock);
646 wait_on_buffer(bh);
647 atomic_dec(&bh->b_count);
648 goto repeat;
650 if (bh->b_dev == dev && bh->b_size != size) {
651 clear_bit(BH_Dirty, &bh->b_state);
652 clear_bit(BH_Uptodate, &bh->b_state);
653 clear_bit(BH_Req, &bh->b_state);
654 bh->b_flushtime = 0;
656 if (atomic_read(&bh->b_count) == 0) {
657 __remove_from_queues(bh);
658 put_last_free(bh);
661 spin_unlock(&lru_list_lock);
666 * We used to try various strange things. Let's not.
668 static void refill_freelist(int size)
670 if (!grow_buffers(size)) {
671 wakeup_bdflush(1);
672 current->policy |= SCHED_YIELD;
673 schedule();
677 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
679 bh->b_list = BUF_CLEAN;
680 bh->b_flushtime = 0;
681 bh->b_end_io = handler;
682 bh->b_dev_id = dev_id;
685 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
687 mark_buffer_uptodate(bh, uptodate);
688 unlock_buffer(bh);
691 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
693 mark_buffer_uptodate(bh, uptodate);
694 unlock_buffer(bh);
695 BUG();
698 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
700 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
701 unsigned long flags;
702 struct buffer_head *tmp;
703 struct page *page;
704 int free;
706 mark_buffer_uptodate(bh, uptodate);
708 /* This is a temporary buffer used for page I/O. */
709 page = mem_map + MAP_NR(bh->b_data);
711 if (!uptodate)
712 SetPageError(page);
715 * Be _very_ careful from here on. Bad things can happen if
716 * two buffer heads end IO at almost the same time and both
717 * decide that the page is now completely done.
719 * Async buffer_heads are here only as labels for IO, and get
720 * thrown away once the IO for this page is complete. IO is
721 * deemed complete once all buffers have been visited
722 * (b_count==0) and are now unlocked. We must make sure that
723 * only the _last_ buffer that decrements its count is the one
724 * that free's the page..
726 spin_lock_irqsave(&page_uptodate_lock, flags);
727 unlock_buffer(bh);
728 atomic_dec(&bh->b_count);
729 tmp = bh->b_this_page;
730 while (tmp != bh) {
731 if (atomic_read(&tmp->b_count) &&
732 (tmp->b_end_io == end_buffer_io_async))
733 goto still_busy;
734 tmp = tmp->b_this_page;
737 /* OK, the async IO on this page is complete. */
738 spin_unlock_irqrestore(&page_uptodate_lock, flags);
741 * if none of the buffers had errors then we can set the
742 * page uptodate:
744 if (!PageError(page))
745 SetPageUptodate(page);
748 * Run the hooks that have to be done when a page I/O has completed.
750 * Note - we need to test the flags before we unlock the page, but
751 * we must not actually free the page until after the unlock!
753 if (test_and_clear_bit(PG_decr_after, &page->flags))
754 atomic_dec(&nr_async_pages);
756 if (test_and_clear_bit(PG_free_swap_after, &page->flags))
757 swap_free(page->offset);
759 free = test_and_clear_bit(PG_free_after, &page->flags);
761 if (page->owner != (void *)-1)
762 PAGE_BUG(page);
763 page->owner = current;
764 UnlockPage(page);
766 if (free)
767 __free_page(page);
769 return;
771 still_busy:
772 spin_unlock_irqrestore(&page_uptodate_lock, flags);
773 return;
778 * Ok, this is getblk, and it isn't very clear, again to hinder
779 * race-conditions. Most of the code is seldom used, (ie repeating),
780 * so it should be much more efficient than it looks.
782 * The algorithm is changed: hopefully better, and an elusive bug removed.
784 * 14.02.92: changed it to sync dirty buffers a bit: better performance
785 * when the filesystem starts to get full of dirty blocks (I hope).
787 struct buffer_head * getblk(kdev_t dev, int block, int size)
789 struct buffer_head * bh;
790 int isize;
792 repeat:
793 bh = get_hash_table(dev, block, size);
794 if (bh) {
795 if (!buffer_dirty(bh)) {
796 bh->b_flushtime = 0;
798 goto out;
801 isize = BUFSIZE_INDEX(size);
802 spin_lock(&free_list[isize].lock);
803 bh = free_list[isize].list;
804 if (bh) {
805 __remove_from_free_list(bh, isize);
806 atomic_set(&bh->b_count, 1);
808 spin_unlock(&free_list[isize].lock);
809 if (!bh)
810 goto refill;
812 /* OK, FINALLY we know that this buffer is the only one of its kind,
813 * we hold a reference (b_count>0), it is unlocked, and it is clean.
815 init_buffer(bh, end_buffer_io_sync, NULL);
816 bh->b_dev = dev;
817 bh->b_blocknr = block;
818 bh->b_state = 1 << BH_Mapped;
820 /* Insert the buffer into the regular lists */
821 insert_into_queues(bh);
822 goto out;
825 * If we block while refilling the free list, somebody may
826 * create the buffer first ... search the hashes again.
828 refill:
829 refill_freelist(size);
830 goto repeat;
831 out:
832 return bh;
836 * if a new dirty buffer is created we need to balance bdflush.
838 * in the future we might want to make bdflush aware of different
839 * pressures on different devices - thus the (currently unused)
840 * 'dev' parameter.
842 int too_many_dirty_buffers;
844 void balance_dirty(kdev_t dev)
846 int dirty = nr_buffers_type[BUF_DIRTY];
847 int ndirty = bdf_prm.b_un.ndirty;
849 if (dirty > ndirty) {
850 if (dirty > 2*ndirty) {
851 too_many_dirty_buffers = 1;
852 wakeup_bdflush(1);
853 return;
855 wakeup_bdflush(0);
857 too_many_dirty_buffers = 0;
858 return;
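With the default bdf_prm values above (ndirty = 500), balance_dirty() therefore does nothing up to 500 dirty buffers, wakes bdflush asynchronously between 501 and 1000, and beyond 1000 declares too_many_dirty_buffers and waits for bdflush via wakeup_bdflush(1). A small standalone sketch of those thresholds:

#include <stdio.h>

int main(void)
{
	int ndirty = 500;			/* bdf_prm.b_un.ndirty default */
	int samples[] = { 100, 600, 1200 };

	for (int i = 0; i < 3; i++) {
		int dirty = samples[i];
		if (dirty > 2 * ndirty)
			printf("%4d dirty: panic mode, wake bdflush and wait\n", dirty);
		else if (dirty > ndirty)
			printf("%4d dirty: wake bdflush, don't wait\n", dirty);
		else
			printf("%4d dirty: nothing to do\n", dirty);
	}
	return 0;
}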
861 static inline void __mark_dirty(struct buffer_head *bh, int flag)
863 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
864 clear_bit(BH_New, &bh->b_state);
865 refile_buffer(bh);
868 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
870 __mark_dirty(bh, flag);
874 * A buffer may need to be moved from one buffer list to another
875 * (e.g. in case it is not shared any more). Handle this.
877 static __inline__ void __refile_buffer(struct buffer_head *bh)
879 int dispose = BUF_CLEAN;
880 if (buffer_locked(bh))
881 dispose = BUF_LOCKED;
882 if (buffer_dirty(bh))
883 dispose = BUF_DIRTY;
884 if (dispose != bh->b_list) {
885 __remove_from_lru_list(bh, bh->b_list);
886 bh->b_list = dispose;
887 __insert_into_lru_list(bh, dispose);
891 void refile_buffer(struct buffer_head *bh)
893 spin_lock(&lru_list_lock);
894 __refile_buffer(bh);
895 spin_unlock(&lru_list_lock);
899 * Release a buffer head
901 void __brelse(struct buffer_head * buf)
903 touch_buffer(buf);
905 if (atomic_read(&buf->b_count)) {
906 atomic_dec(&buf->b_count);
907 return;
909 printk("VFS: brelse: Trying to free free buffer\n");
913 * bforget() is like brelse(), except it puts the buffer on the
914 * free list if it can.. We can NOT free the buffer if:
915 * - there are other users of it
916 * - it is locked and thus can have active IO
918 void __bforget(struct buffer_head * buf)
920 spin_lock(&lru_list_lock);
921 write_lock(&hash_table_lock);
922 if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
923 touch_buffer(buf);
924 atomic_dec(&buf->b_count);
925 } else {
926 atomic_set(&buf->b_count, 0);
927 buf->b_state = 0;
928 if (buf->b_pprev)
929 __hash_unlink(buf);
930 __remove_from_lru_list(buf, buf->b_list);
931 put_last_free(buf);
933 write_unlock(&hash_table_lock);
934 spin_unlock(&lru_list_lock);
938 * bread() reads a specified block and returns the buffer that contains
939 * it. It returns NULL if the block was unreadable.
941 struct buffer_head * bread(kdev_t dev, int block, int size)
943 struct buffer_head * bh;
945 bh = getblk(dev, block, size);
946 if (buffer_uptodate(bh))
947 return bh;
948 ll_rw_block(READ, 1, &bh);
949 wait_on_buffer(bh);
950 if (buffer_uptodate(bh))
951 return bh;
952 brelse(bh);
953 return NULL;
957 * Ok, breada can be used as bread, but additionally to mark other
958 * blocks for reading as well. End the argument list with a negative
959 * number.
962 #define NBUF 16
964 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
965 unsigned int pos, unsigned int filesize)
967 struct buffer_head * bhlist[NBUF];
968 unsigned int blocks;
969 struct buffer_head * bh;
970 int index;
971 int i, j;
973 if (pos >= filesize)
974 return NULL;
976 if (block < 0)
977 return NULL;
979 bh = getblk(dev, block, bufsize);
980 index = BUFSIZE_INDEX(bh->b_size);
982 if (buffer_uptodate(bh))
983 return(bh);
984 else ll_rw_block(READ, 1, &bh);
986 blocks = (filesize - pos) >> (9+index);
988 if (blocks < (read_ahead[MAJOR(dev)] >> index))
989 blocks = read_ahead[MAJOR(dev)] >> index;
990 if (blocks > NBUF)
991 blocks = NBUF;
993 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
995 bhlist[0] = bh;
996 j = 1;
997 for(i=1; i<blocks; i++) {
998 bh = getblk(dev,block+i,bufsize);
999 if (buffer_uptodate(bh)) {
1000 brelse(bh);
1001 break;
1003 else bhlist[j++] = bh;
1006 /* Request the read for these buffers, and then release them. */
1007 if (j>1)
1008 ll_rw_block(READA, (j-1), bhlist+1);
1009 for(i=1; i<j; i++)
1010 brelse(bhlist[i]);
1012 /* Wait for this buffer, and then continue on. */
1013 bh = bhlist[0];
1014 wait_on_buffer(bh);
1015 if (buffer_uptodate(bh))
1016 return bh;
1017 brelse(bh);
1018 return NULL;
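The read-ahead window in breada() above is the larger of the blocks remaining in the file and the per-major read_ahead setting, capped at NBUF. A userspace sketch of that sizing, assuming 1K blocks (index 1) and a read_ahead value expressed in 512-byte sectors:

#include <stdio.h>

#define NBUF 16

int main(void)
{
	int index = 1;				/* BUFSIZE_INDEX(1024) */
	unsigned int read_ahead = 8;		/* assumed read_ahead[MAJOR(dev)] */
	unsigned int pos = 0, filesize = 40 * 1024;

	unsigned int blocks = (filesize - pos) >> (9 + index);
	if (blocks < (read_ahead >> index))
		blocks = read_ahead >> index;
	if (blocks > NBUF)
		blocks = NBUF;

	/* breada() requests bhlist[0] plus up to blocks-1 read-ahead blocks */
	printf("%u bytes left: at most %u blocks queued (1 + %u read-ahead)\n",
	       filesize - pos, blocks, blocks - 1);
	return 0;
}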
1022 * Note: the caller should wake up the buffer_wait list if needed.
1024 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1026 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1027 kmem_cache_free(bh_cachep, bh);
1028 } else {
1029 bh->b_blocknr = -1;
1030 init_waitqueue_head(&bh->b_wait);
1031 nr_unused_buffer_heads++;
1032 bh->b_next_free = unused_list;
1033 bh->b_this_page = NULL;
1034 unused_list = bh;
1038 static void put_unused_buffer_head(struct buffer_head *bh)
1040 spin_lock(&unused_list_lock);
1041 __put_unused_buffer_head(bh);
1042 spin_unlock(&unused_list_lock);
1046 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1047 * no-buffer-head deadlock. Return NULL on failure; waiting for
1048 * buffer heads is now handled in create_buffers().
1050 static struct buffer_head * get_unused_buffer_head(int async)
1052 struct buffer_head * bh;
1054 spin_lock(&unused_list_lock);
1055 if (nr_unused_buffer_heads > NR_RESERVED) {
1056 bh = unused_list;
1057 unused_list = bh->b_next_free;
1058 nr_unused_buffer_heads--;
1059 spin_unlock(&unused_list_lock);
1060 return bh;
1062 spin_unlock(&unused_list_lock);
1064 /* This is critical. We can't swap out pages to get
1065 * more buffer heads, because the swap-out may need
1066 * more buffer-heads itself. Thus SLAB_BUFFER.
1068 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1069 memset(bh, 0, sizeof(*bh));
1070 init_waitqueue_head(&bh->b_wait);
1071 return bh;
1075 * If we need an async buffer, use the reserved buffer heads.
1077 if (async) {
1078 spin_lock(&unused_list_lock);
1079 if (unused_list) {
1080 bh = unused_list;
1081 unused_list = bh->b_next_free;
1082 nr_unused_buffer_heads--;
1083 spin_unlock(&unused_list_lock);
1084 return bh;
1086 spin_unlock(&unused_list_lock);
1088 #if 0
1090 * (Pending further analysis ...)
1091 * Ordinary (non-async) requests can use a different memory priority
1092 * to free up pages. Any swapping thus generated will use async
1093 * buffer heads.
1095 if(!async &&
1096 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1097 memset(bh, 0, sizeof(*bh));
1098 init_waitqueue_head(&bh->b_wait);
1099 return bh;
1101 #endif
1103 return NULL;
1107 * Create the appropriate buffers when given a page for data area and
1108 * the size of each buffer.. Use the bh->b_this_page linked list to
1109 * follow the buffers created. Return NULL if unable to create more
1110 * buffers.
1111 * The async flag is used to differentiate async IO (paging, swapping)
1112 * from ordinary buffer allocations, and only async requests are allowed
1113 * to sleep waiting for buffer heads.
1115 static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async)
1117 DECLARE_WAITQUEUE(wait, current);
1118 struct buffer_head *bh, *head;
1119 long offset;
1121 try_again:
1122 head = NULL;
1123 offset = PAGE_SIZE;
1124 while ((offset -= size) >= 0) {
1125 bh = get_unused_buffer_head(async);
1126 if (!bh)
1127 goto no_grow;
1129 bh->b_dev = B_FREE; /* Flag as unused */
1130 bh->b_this_page = head;
1131 head = bh;
1133 bh->b_state = 0;
1134 bh->b_next_free = NULL;
1135 bh->b_pprev = NULL;
1136 atomic_set(&bh->b_count, 0);
1137 bh->b_size = size;
1139 bh->b_data = (char *) (page+offset);
1140 bh->b_list = BUF_CLEAN;
1141 bh->b_flushtime = 0;
1142 bh->b_end_io = end_buffer_io_bad;
1144 return head;
1146 * In case anything failed, we just free everything we got.
1148 no_grow:
1149 if (head) {
1150 do {
1151 bh = head;
1152 head = head->b_this_page;
1153 put_unused_buffer_head(bh);
1154 } while (head);
1156 /* Wake up any waiters ... */
1157 wake_up(&buffer_wait);
1161 * Return failure for non-async IO requests. Async IO requests
1162 * are not allowed to fail, so we have to wait until buffer heads
1163 * become available. But we don't want tasks sleeping with
1164 * partially complete buffers, so all were released above.
1166 if (!async)
1167 return NULL;
1169 /* We're _really_ low on memory. Now we just
1170 * wait for old buffer heads to become free due to
1171 * finishing IO. Since this is an async request and
1172 * the reserve list is empty, we're sure there are
1173 * async buffer heads in use.
1175 run_task_queue(&tq_disk);
1178 * Set our state for sleeping, then check again for buffer heads.
1179 * This ensures we won't miss a wake_up from an interrupt.
1181 add_wait_queue(&buffer_wait, &wait);
1182 current->state = TASK_UNINTERRUPTIBLE;
1183 if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
1184 current->policy |= SCHED_YIELD;
1185 schedule();
1187 remove_wait_queue(&buffer_wait, &wait);
1188 current->state = TASK_RUNNING;
1189 goto try_again;
1192 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1194 struct buffer_head *head, *bh, *tail;
1195 int block;
1197 if (!PageLocked(page))
1198 BUG();
1199 if (page->owner != current)
1200 PAGE_BUG(page);
1202 * Allocate async buffer heads pointing to this page, just for I/O.
1203 * They show up in the buffer hash table and are registered in
1204 * page->buffers.
1206 head = create_buffers(page_address(page), size, 1);
1207 if (page->buffers)
1208 BUG();
1209 if (!head)
1210 BUG();
1211 tail = head;
1212 for (bh = head; bh; bh = bh->b_this_page) {
1213 block = *(b++);
1215 tail = bh;
1216 init_buffer(bh, end_buffer_io_async, NULL);
1217 bh->b_dev = dev;
1218 bh->b_blocknr = block;
1221 * When we use bmap, we define block zero to represent
1222 * a hole. ll_rw_page, however, may legitimately
1223 * access block zero, and we need to distinguish the
1224 * two cases.
1226 if (bmap && !block) {
1227 memset(bh->b_data, 0, size);
1228 set_bit(BH_Uptodate, &bh->b_state);
1229 continue;
1231 set_bit(BH_Mapped, &bh->b_state);
1233 tail->b_this_page = head;
1234 get_page(page);
1235 page->buffers = head;
1236 return 0;
1240 * We don't have to release all buffers here, but
1241 * we have to be sure that no dirty buffer is left
1242 * and no IO is going on (no buffer is locked), because
1243 * we have truncated the file and are going to free the
1244 * blocks on-disk..
1246 int block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
1248 struct buffer_head *head, *bh, *next;
1249 unsigned int curr_off = 0;
1251 if (!PageLocked(page))
1252 BUG();
1253 if (!page->buffers)
1254 return 0;
1256 head = page->buffers;
1257 bh = head;
1258 do {
1259 unsigned int next_off = curr_off + bh->b_size;
1260 next = bh->b_this_page;
1263 * is this block fully flushed?
1265 if (offset <= curr_off) {
1266 if (buffer_mapped(bh)) {
1267 atomic_inc(&bh->b_count);
1268 wait_on_buffer(bh);
1269 if (bh->b_dev == B_FREE)
1270 BUG();
1271 mark_buffer_clean(bh);
1272 clear_bit(BH_Uptodate, &bh->b_state);
1273 clear_bit(BH_Mapped, &bh->b_state);
1274 clear_bit(BH_Req, &bh->b_state);
1275 bh->b_blocknr = 0;
1276 atomic_dec(&bh->b_count);
1279 curr_off = next_off;
1280 bh = next;
1281 } while (bh != head);
1284 * subtle. We release buffer-heads only if this is
1285 * the 'final' flushpage. We have invalidated the bmap
1286 * cached value unconditionally, so real IO is not
1287 * possible anymore.
1289 * If the free doesn't work out, the buffers can be
1290 * left around - they just turn into anonymous buffers
1291 * instead.
1293 if (!offset) {
1294 if (!try_to_free_buffers(page))
1295 atomic_add(PAGE_CACHE_SIZE, &buffermem);
1298 return 0;
1301 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1303 struct buffer_head *bh, *head, *tail;
1305 head = create_buffers(page_address(page), blocksize, 1);
1306 if (page->buffers)
1307 BUG();
1309 bh = head;
1310 do {
1311 bh->b_dev = inode->i_dev;
1312 bh->b_blocknr = 0;
1313 bh->b_end_io = end_buffer_io_bad;
1314 tail = bh;
1315 bh = bh->b_this_page;
1316 } while (bh);
1317 tail->b_this_page = head;
1318 page->buffers = head;
1319 get_page(page);
1323 * block_write_full_page() is SMP-safe - currently it's still
1324 * being called with the kernel lock held, but the code is ready.
1326 int block_write_full_page(struct file *file, struct page *page)
1328 struct dentry *dentry = file->f_dentry;
1329 struct inode *inode = dentry->d_inode;
1330 int err, i;
1331 unsigned long block, offset;
1332 struct buffer_head *bh, *head;
1334 if (!PageLocked(page))
1335 BUG();
1337 if (!page->buffers)
1338 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1339 head = page->buffers;
1341 offset = page->offset;
1342 block = offset >> inode->i_sb->s_blocksize_bits;
1344 // FIXME: currently we assume page alignment.
1345 if (offset & (PAGE_SIZE-1))
1346 BUG();
1348 bh = head;
1349 i = 0;
1350 do {
1351 if (!bh)
1352 BUG();
1355 * If the buffer isn't up-to-date, we can't be sure
1356 * that the buffer has been initialized with the proper
1357 * block number information etc..
1359 * Leave it to the low-level FS to make all those
1360 * decisions (block #0 may actually be a valid block)
1362 bh->b_end_io = end_buffer_io_sync;
1363 if (!buffer_mapped(bh)) {
1364 err = inode->i_op->get_block(inode, block, bh, 1);
1365 if (err)
1366 goto out;
1368 set_bit(BH_Uptodate, &bh->b_state);
1369 mark_buffer_dirty(bh,0);
1371 bh = bh->b_this_page;
1372 block++;
1373 } while (bh != head);
1375 SetPageUptodate(page);
1376 return 0;
1377 out:
1378 ClearPageUptodate(page);
1379 return err;
1382 int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
1384 struct dentry *dentry = file->f_dentry;
1385 struct inode *inode = dentry->d_inode;
1386 unsigned long block;
1387 int err, partial;
1388 unsigned long blocksize, start_block, end_block;
1389 unsigned long start_offset, start_bytes, end_bytes;
1390 unsigned long bbits, blocks, i, len;
1391 struct buffer_head *bh, *head;
1392 char * target_buf;
1394 target_buf = (char *)page_address(page) + offset;
1396 if (!PageLocked(page))
1397 BUG();
1399 blocksize = inode->i_sb->s_blocksize;
1400 if (!page->buffers)
1401 create_empty_buffers(page, inode, blocksize);
1402 head = page->buffers;
1404 bbits = inode->i_sb->s_blocksize_bits;
1405 block = page->offset >> bbits;
1406 blocks = PAGE_SIZE >> bbits;
1407 start_block = offset >> bbits;
1408 end_block = (offset + bytes - 1) >> bbits;
1409 start_offset = offset & (blocksize - 1);
1410 start_bytes = blocksize - start_offset;
1411 if (start_bytes > bytes)
1412 start_bytes = bytes;
1413 end_bytes = (offset+bytes) & (blocksize - 1);
1414 if (end_bytes > bytes)
1415 end_bytes = bytes;
1417 if (offset < 0 || offset >= PAGE_SIZE)
1418 BUG();
1419 if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
1420 BUG();
1421 if (start_block < 0 || start_block >= blocks)
1422 BUG();
1423 if (end_block < 0 || end_block >= blocks)
1424 BUG();
1425 // FIXME: currently we assume page alignment.
1426 if (page->offset & (PAGE_SIZE-1))
1427 BUG();
1429 i = 0;
1430 bh = head;
1431 partial = 0;
1432 do {
1433 if (!bh)
1434 BUG();
1436 if ((i < start_block) || (i > end_block)) {
1437 if (!buffer_uptodate(bh))
1438 partial = 1;
1439 goto skip;
1443 * If the buffer is not up-to-date, we need to ask the low-level
1444 * FS to do something for us (we used to have assumptions about
1445 * the meaning of b_blocknr etc, that's bad).
1447 * If "update" is set, that means that the low-level FS should
1448 * try to make sure that the block is up-to-date because we're
1449 * not going to fill it completely.
1451 bh->b_end_io = end_buffer_io_sync;
1452 if (!buffer_mapped(bh)) {
1453 err = inode->i_op->get_block(inode, block, bh, 1);
1454 if (err)
1455 goto out;
1458 if (!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) {
1459 if (buffer_new(bh)) {
1460 memset(bh->b_data, 0, bh->b_size);
1461 } else {
1462 ll_rw_block(READ, 1, &bh);
1463 wait_on_buffer(bh);
1464 err = -EIO;
1465 if (!buffer_uptodate(bh))
1466 goto out;
1470 len = blocksize;
1471 if (start_offset) {
1472 len = start_bytes;
1473 start_offset = 0;
1474 } else if (end_bytes && (i == end_block)) {
1475 len = end_bytes;
1476 end_bytes = 0;
1478 err = copy_from_user(target_buf, buf, len);
1479 target_buf += len;
1480 buf += len;
1483 * we dirty buffers only after copying the data into
1484 * the page - this way we can dirty the buffer even if
1485 * the bh is still doing IO.
1487 * NOTE! This also does a direct dirty balance check,
1488 * rather than relying on bdflush just waking up every
1489 * once in a while. This is to catch (and slow down)
1490 * the processes that write tons of buffers.
1492 * Note how we do NOT want to do this in the full block
1493 * case: full pages are flushed not by the people who
1494 * dirtied them, but by people who need memory. And we
1495 * should not penalize them for somebody else writing
1496 * lots of dirty pages.
1498 set_bit(BH_Uptodate, &bh->b_state);
1499 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1500 __mark_dirty(bh, 0);
1501 if (too_many_dirty_buffers)
1502 balance_dirty(bh->b_dev);
1505 if (err) {
1506 err = -EFAULT;
1507 goto out;
1510 skip:
1511 i++;
1512 block++;
1513 bh = bh->b_this_page;
1514 } while (bh != head);
1517 * If this is a partial write that happened to make all buffers
1518 * uptodate then we can optimize away a bogus readpage() for
1519 * the next read(). Here we 'discover' whether the page went
1520 * uptodate as a result of this (potentially partial) write.
1521 */
1522 if (!partial)
1523 SetPageUptodate(page);
1524 return bytes;
1525 out:
1526 ClearPageUptodate(page);
1527 return err;
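The start/end arithmetic at the top of block_write_partial_page() above decides which blocks of the page the write touches and how many bytes land in the first and last of them. A standalone sketch of that window computation, assuming a 1K block size inside a 4K page:

#include <stdio.h>

int main(void)
{
	unsigned long blocksize = 1024, bbits = 10;	/* assumed fs block size */
	unsigned long offset = 700, bytes = 1500;	/* write window inside the page */

	unsigned long start_block = offset >> bbits;
	unsigned long end_block = (offset + bytes - 1) >> bbits;
	unsigned long start_offset = offset & (blocksize - 1);
	unsigned long start_bytes = blocksize - start_offset;
	unsigned long end_bytes = (offset + bytes) & (blocksize - 1);

	if (start_bytes > bytes)
		start_bytes = bytes;
	if (end_bytes > bytes)
		end_bytes = bytes;

	printf("blocks %lu..%lu, %lu bytes in the first block, %lu in the last\n",
	       start_block, end_block, start_bytes, end_bytes);
	return 0;
}

For offset 700 and 1500 bytes this reports blocks 0..2 with 324 bytes in the first block and 152 in the last (324 + 1024 + 152 = 1500); only those two partially covered blocks may need to be read or zero-filled before the copy_from_user().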
1531 * Start I/O on a page.
1532 * This function expects the page to be locked and may return
1533 * before I/O is complete. You then have to check page->locked,
1534 * page->uptodate, and maybe wait on page->wait.
1536 * brw_page() is SMP-safe, although it's being called with the
1537 * kernel lock held - but the code is ready.
1539 * FIXME: we need a swapper_inode->get_block function to remove
1540 * some of the bmap kludges and interface ugliness here.
1542 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1544 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1545 int nr, fresh /* temporary debugging flag */, block;
1547 if (!PageLocked(page))
1548 panic("brw_page: page not locked for I/O");
1549 // clear_bit(PG_error, &page->flags);
1551 * We pretty much rely on the page lock for this, because
1552 * create_page_buffers() might sleep.
1554 fresh = 0;
1555 if (!page->buffers) {
1556 create_page_buffers(rw, page, dev, b, size, bmap);
1557 fresh = 1;
1559 if (!page->buffers)
1560 BUG();
1561 page->owner = (void *)-1;
1563 head = page->buffers;
1564 bh = head;
1565 nr = 0;
1566 do {
1567 block = *(b++);
1569 if (fresh && (atomic_read(&bh->b_count) != 0))
1570 BUG();
1571 if (rw == READ) {
1572 if (!fresh)
1573 BUG();
1574 if (bmap && !block) {
1575 if (block)
1576 BUG();
1577 } else {
1578 if (bmap && !block)
1579 BUG();
1580 if (!buffer_uptodate(bh)) {
1581 arr[nr++] = bh;
1582 atomic_inc(&bh->b_count);
1585 } else { /* WRITE */
1586 if (!bh->b_blocknr) {
1587 if (!block)
1588 BUG();
1589 bh->b_blocknr = block;
1590 } else {
1591 if (!block)
1592 BUG();
1594 set_bit(BH_Uptodate, &bh->b_state);
1595 set_bit(BH_Dirty, &bh->b_state);
1596 arr[nr++] = bh;
1597 atomic_inc(&bh->b_count);
1599 bh = bh->b_this_page;
1600 } while (bh != head);
1601 if (rw == READ)
1602 ++current->maj_flt;
1603 if ((rw == READ) && nr) {
1604 if (Page_Uptodate(page))
1605 BUG();
1606 ll_rw_block(rw, nr, arr);
1607 } else {
1608 if (!nr && rw == READ) {
1609 SetPageUptodate(page);
1610 page->owner = current;
1611 UnlockPage(page);
1613 if (nr && (rw == WRITE))
1614 ll_rw_block(rw, nr, arr);
1616 return 0;
1620 * Generic "read page" function for block devices that have the normal
1621 * bmap functionality. This is most of the block device filesystems.
1622 * Reads the page asynchronously --- the unlock_buffer() and
1623 * mark_buffer_uptodate() functions propagate buffer state into the
1624 * page struct once IO has completed.
1626 int block_read_full_page(struct file * file, struct page * page)
1628 struct dentry *dentry = file->f_dentry;
1629 struct inode *inode = dentry->d_inode;
1630 unsigned long iblock;
1631 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1632 unsigned int blocksize, blocks;
1633 int nr;
1635 if (!PageLocked(page))
1636 PAGE_BUG(page);
1637 blocksize = inode->i_sb->s_blocksize;
1638 if (!page->buffers)
1639 create_empty_buffers(page, inode, blocksize);
1640 head = page->buffers;
1642 blocks = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1643 iblock = page->offset >> inode->i_sb->s_blocksize_bits;
1644 page->owner = (void *)-1;
1645 head = page->buffers;
1646 bh = head;
1647 nr = 0;
1649 do {
1650 if (buffer_uptodate(bh))
1651 continue;
1653 if (!buffer_mapped(bh)) {
1654 inode->i_op->get_block(inode, iblock, bh, 0);
1655 if (!buffer_mapped(bh)) {
1656 memset(bh->b_data, 0, blocksize);
1657 set_bit(BH_Uptodate, &bh->b_state);
1658 continue;
1662 init_buffer(bh, end_buffer_io_async, NULL);
1663 atomic_inc(&bh->b_count);
1664 arr[nr] = bh;
1665 nr++;
1666 } while (iblock++, (bh = bh->b_this_page) != head);
1668 ++current->maj_flt;
1669 if (nr) {
1670 if (Page_Uptodate(page))
1671 BUG();
1672 ll_rw_block(READ, nr, arr);
1673 } else {
1675 * all buffers are uptodate - we can set the page
1676 * uptodate as well.
1678 SetPageUptodate(page);
1679 page->owner = current;
1680 UnlockPage(page);
1682 return 0;
1686 * Try to increase the number of buffers available: the size argument
1687 * is used to determine what kind of buffers we want.
1689 static int grow_buffers(int size)
1691 unsigned long page;
1692 struct buffer_head *bh, *tmp;
1693 struct buffer_head * insert_point;
1694 int isize;
1696 if ((size & 511) || (size > PAGE_SIZE)) {
1697 printk("VFS: grow_buffers: size = %d\n",size);
1698 return 0;
1701 if (!(page = __get_free_page(GFP_BUFFER)))
1702 return 0;
1703 bh = create_buffers(page, size, 0);
1704 if (!bh) {
1705 free_page(page);
1706 return 0;
1709 isize = BUFSIZE_INDEX(size);
1711 spin_lock(&free_list[isize].lock);
1712 insert_point = free_list[isize].list;
1713 tmp = bh;
1714 while (1) {
1715 if (insert_point) {
1716 tmp->b_next_free = insert_point->b_next_free;
1717 tmp->b_prev_free = insert_point;
1718 insert_point->b_next_free->b_prev_free = tmp;
1719 insert_point->b_next_free = tmp;
1720 } else {
1721 tmp->b_prev_free = tmp;
1722 tmp->b_next_free = tmp;
1724 insert_point = tmp;
1725 if (tmp->b_this_page)
1726 tmp = tmp->b_this_page;
1727 else
1728 break;
1730 tmp->b_this_page = bh;
1731 free_list[isize].list = bh;
1732 spin_unlock(&free_list[isize].lock);
1734 mem_map[MAP_NR(page)].buffers = bh;
1735 atomic_add(PAGE_SIZE, &buffermem);
1736 return 1;
1740 * Can the buffer be thrown out?
1742 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1743 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
1746 * try_to_free_buffers() checks if all the buffers on this particular page
1747 * are unused, and frees the page if so.
1749 * Wake up bdflush() if this fails - if we're running low on memory due
1750 * to dirty buffers, we need to flush them out as quickly as possible.
1752 * NOTE: There are quite a number of ways that threads of control can
1753 * obtain a reference to a buffer head within a page. So we must
1754 * lock out all of these paths to cleanly toss the page.
1756 int try_to_free_buffers(struct page * page)
1758 struct buffer_head * tmp, * bh = page->buffers;
1759 int index = BUFSIZE_INDEX(bh->b_size);
1760 int ret;
1762 spin_lock(&lru_list_lock);
1763 write_lock(&hash_table_lock);
1764 spin_lock(&free_list[index].lock);
1765 tmp = bh;
1766 do {
1767 struct buffer_head * p = tmp;
1769 tmp = tmp->b_this_page;
1770 if (buffer_busy(p))
1771 goto busy_buffer_page;
1772 } while (tmp != bh);
1774 spin_lock(&unused_list_lock);
1775 tmp = bh;
1776 do {
1777 struct buffer_head * p = tmp;
1778 tmp = tmp->b_this_page;
1780 /* The buffer can be either on the regular
1781 * queues or on the free list..
1783 if (p->b_dev == B_FREE) {
1784 __remove_from_free_list(p, index);
1785 } else {
1786 if (p->b_pprev)
1787 __hash_unlink(p);
1788 __remove_from_lru_list(p, p->b_list);
1790 __put_unused_buffer_head(p);
1791 } while (tmp != bh);
1792 spin_unlock(&unused_list_lock);
1794 /* Wake up anyone waiting for buffer heads */
1795 wake_up(&buffer_wait);
1797 /* And free the page */
1798 page->buffers = NULL;
1799 __free_page(page);
1800 ret = 1;
1801 out:
1802 spin_unlock(&free_list[index].lock);
1803 write_unlock(&hash_table_lock);
1804 spin_unlock(&lru_list_lock);
1805 return ret;
1807 busy_buffer_page:
1808 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
1809 too_many_dirty_buffers = 1;
1810 wakeup_bdflush(0);
1811 ret = 0;
1812 goto out;
1815 /* ===================== Init ======================= */
1818 * allocate the hash table and init the free list
1819 * Use gfp() for the hash table to decrease TLB misses, use
1820 * SLAB cache for buffer heads.
1822 void __init buffer_init(unsigned long memory_size)
1824 int order, i;
1825 unsigned int nr_hash;
1827 /* The buffer cache hash table is less important these days,
1828 * trim it a bit.
1830 memory_size >>= 14;
1831 memory_size *= sizeof(struct buffer_head *);
1832 for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
1835 /* try to allocate something until we get it or we're asking
1836 for something that is really too small */
1838 do {
1839 unsigned long tmp;
1841 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
1842 bh_hash_mask = (nr_hash - 1);
1844 tmp = nr_hash;
1845 bh_hash_shift = 0;
1846 while((tmp >>= 1UL) != 0UL)
1847 bh_hash_shift++;
1849 hash_table = (struct buffer_head **)
1850 __get_free_pages(GFP_ATOMIC, order);
1851 } while (hash_table == NULL && --order > 0);
1852 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
1853 nr_hash, order, (1UL<<order) * PAGE_SIZE);
1855 if (!hash_table)
1856 panic("Failed to allocate buffer hash table\n");
1858 /* Setup hash chains. */
1859 for(i = 0; i < nr_hash; i++)
1860 hash_table[i] = NULL;
1862 /* Setup free lists. */
1863 for(i = 0; i < NR_SIZES; i++) {
1864 free_list[i].list = NULL;
1865 free_list[i].lock = SPIN_LOCK_UNLOCKED;
1868 /* Setup lru lists. */
1869 for(i = 0; i < NR_LIST; i++)
1870 lru_list[i] = NULL;
1872 bh_cachep = kmem_cache_create("buffer_head",
1873 sizeof(struct buffer_head),
1875 SLAB_HWCACHE_ALIGN, NULL, NULL);
1876 if(!bh_cachep)
1877 panic("Cannot create buffer head SLAB cache\n");
1881 /* ====================== bdflush support =================== */
1883 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1884 * response to dirty buffers. Once this process is activated, we write back
1885 * a limited number of buffers to the disks and then go back to sleep again.
1887 static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
1888 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
1889 struct task_struct *bdflush_tsk = 0;
1891 void wakeup_bdflush(int wait)
1893 if (current == bdflush_tsk)
1894 return;
1895 if (wait)
1896 run_task_queue(&tq_disk);
1897 wake_up(&bdflush_wait);
1898 if (wait)
1899 sleep_on(&bdflush_done);
1904 * Here we attempt to write back old buffers. We also try to flush inodes
1905 * and supers as well, since this function is essentially "update", and
1906 * otherwise there would be no way of ensuring that these quantities ever
1907 * get written back. Ideally, we would have a timestamp on the inodes
1908 * and superblocks so that we could write back only the old ones as well
1911 static int sync_old_buffers(void)
1913 int nlist;
1915 lock_kernel();
1916 sync_supers(0);
1917 sync_inodes(0);
1918 unlock_kernel();
1920 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
1921 struct buffer_head *bh;
1922 repeat:
1923 spin_lock(&lru_list_lock);
1924 bh = lru_list[nlist];
1925 if(bh) {
1926 struct buffer_head *next;
1927 int i;
1928 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1929 next = bh->b_next_free;
1931 /* If the buffer is not on the proper list,
1932 * then refile it.
1934 if ((nlist == BUF_DIRTY &&
1935 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
1936 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
1937 __refile_buffer(bh);
1938 continue;
1941 if (buffer_locked(bh) || !buffer_dirty(bh))
1942 continue;
1944 /* OK, now we are committed to write it out. */
1945 bh->b_flushtime = 0;
1946 atomic_inc(&bh->b_count);
1947 spin_unlock(&lru_list_lock);
1948 ll_rw_block(WRITE, 1, &bh);
1949 atomic_dec(&bh->b_count);
1950 goto repeat;
1953 spin_unlock(&lru_list_lock);
1955 run_task_queue(&tq_disk);
1956 return 0;
1959 struct mm_struct * start_lazy_tlb(void)
1961 struct mm_struct *mm = current->mm;
1962 atomic_inc(&mm->mm_count);
1963 current->mm = NULL;
1964 /* active_mm is still 'mm' */
1965 return mm;
1968 void end_lazy_tlb(struct mm_struct *mm)
1970 struct mm_struct *active_mm = current->active_mm;
1972 current->mm = mm;
1973 if (mm != active_mm) {
1974 current->active_mm = mm;
1975 activate_context();
1977 mmdrop(active_mm);
1980 /* This is the interface to bdflush. As we get more sophisticated, we can
1981 * pass tuning parameters to this "process", to adjust how it behaves.
1982 * We would want to verify each parameter, however, to make sure that it
1983 * is reasonable. */
1985 asmlinkage int sys_bdflush(int func, long data)
1987 if (!capable(CAP_SYS_ADMIN))
1988 return -EPERM;
1990 if (func == 1) {
1991 int error;
1992 struct mm_struct *user_mm;
1995 * bdflush will spend all of its time in kernel-space,
1996 * without touching user-space, so we can switch it into
1997 * 'lazy TLB mode' to reduce the cost of context-switches
1998 * to and from bdflush.
2000 user_mm = start_lazy_tlb();
2001 error = sync_old_buffers();
2002 end_lazy_tlb(user_mm);
2003 return error;
2006 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2007 if (func >= 2) {
2008 int i = (func-2) >> 1;
2009 if (i >= 0 && i < N_PARAM) {
2010 if ((func & 1) == 0)
2011 return put_user(bdf_prm.data[i], (int*)data);
2013 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2014 bdf_prm.data[i] = data;
2015 return 0;
2018 return -EINVAL;
2021 /* Historically, func 0 launched the actual bdflush and then never
2022 * returned (unless it was explicitly killed). We return zero here to
2023 * remain semi-compatible with present update(8) programs.
2024 */
2025 return 0;
2029 * This is the actual bdflush daemon itself. It used to be started from
2030 * the syscall above, but now we launch it ourselves internally with
2031 * kernel_thread(...) directly after the first thread in init/main.c
2033 int bdflush(void * unused)
2036 * We have a bare-bones task_struct, and really should fill
2037 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2038 * display semi-sane things. Not real crucial though...
2041 current->session = 1;
2042 current->pgrp = 1;
2043 sprintf(current->comm, "kflushd");
2044 bdflush_tsk = current;
2046 for (;;) {
2047 int nlist;
2049 CHECK_EMERGENCY_SYNC
2051 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2052 int nr, major, written = 0;
2053 struct buffer_head *next;
2055 repeat:
2056 spin_lock(&lru_list_lock);
2057 next = lru_list[nlist];
2058 nr = nr_buffers_type[nlist];
2059 while (nr-- > 0) {
2060 struct buffer_head *bh = next;
2062 next = next->b_next_free;
2064 /* If the buffer is not on the correct list,
2065 * then refile it.
2067 if ((nlist == BUF_DIRTY &&
2068 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2069 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2070 __refile_buffer(bh);
2071 continue;
2074 /* If we aren't in panic mode, don't write out too much
2075 * at a time. Also, don't write out buffers we don't
2076 * really have to write out yet..
2078 if (!too_many_dirty_buffers) {
2079 if (written > bdf_prm.b_un.ndirty)
2080 break;
2081 if (time_before(jiffies, bh->b_flushtime))
2082 continue;
2085 if (buffer_locked(bh) || !buffer_dirty(bh))
2086 continue;
2088 major = MAJOR(bh->b_dev);
2089 written++;
2090 bh->b_flushtime = 0;
2093 * For the loop major we can try to do asynchronous writes,
2094 * but we have to guarantee that we're making some progress..
2096 atomic_inc(&bh->b_count);
2097 spin_unlock(&lru_list_lock);
2098 if (major == LOOP_MAJOR && written > 1) {
2099 ll_rw_block(WRITEA, 1, &bh);
2100 if (buffer_dirty(bh))
2101 --written;
2102 } else
2103 ll_rw_block(WRITE, 1, &bh);
2104 atomic_dec(&bh->b_count);
2105 goto repeat;
2107 spin_unlock(&lru_list_lock);
2109 run_task_queue(&tq_disk);
2110 wake_up(&bdflush_done);
2113 * If there are still a lot of dirty buffers around,
2114 * skip the sleep and flush some more. Otherwise, we
2115 * sleep for a while and mark us as not being in panic
2116 * mode..
2118 if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
2119 too_many_dirty_buffers = 0;
2120 spin_lock_irq(&current->sigmask_lock);
2121 flush_signals(current);
2122 spin_unlock_irq(&current->sigmask_lock);
2123 interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);