1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 #include <linux/sched.h>
30 #include <linux/fs.h>
31 #include <linux/malloc.h>
32 #include <linux/locks.h>
33 #include <linux/errno.h>
34 #include <linux/swap.h>
35 #include <linux/swapctl.h>
36 #include <linux/smp_lock.h>
37 #include <linux/vmalloc.h>
38 #include <linux/blkdev.h>
39 #include <linux/sysrq.h>
40 #include <linux/file.h>
41 #include <linux/init.h>
42 #include <linux/quotaops.h>
44 #include <asm/uaccess.h>
45 #include <asm/io.h>
46 #include <asm/bitops.h>
48 #define NR_SIZES 7
49 static char buffersize_index[65] =
50 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
51 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
52 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
53 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
54 6};
56 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
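/*
 * Example: with the table above, BUFSIZE_INDEX(512) == 0,
 * BUFSIZE_INDEX(1024) == 1, BUFSIZE_INDEX(4096) == 3 and
 * BUFSIZE_INDEX(32768) == 6, i.e. log2(size) - 9 for the supported
 * power-of-two block sizes.
 */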
57 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
58 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
59 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
60 number of unused buffer heads */
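/*
 * On a machine with 4kB pages that works out to at most 8 buffer
 * heads per page, 16 reserved buffer heads, and at most 36 unused
 * buffer heads kept around.
 */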
62 /* Anti-deadlock ordering:
63 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
64 */
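/*
 * That is: whenever more than one of these locks is held, they are
 * taken in the order above (lru_list_lock outermost) and released in
 * reverse, as insert_into_queues() and try_to_free_buffers() below do.
 */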
66 /*
67 * Hash table gook..
68 */
69 static unsigned int bh_hash_mask = 0;
70 static unsigned int bh_hash_shift = 0;
71 static struct buffer_head **hash_table;
72 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
74 static struct buffer_head *lru_list[NR_LIST];
75 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
76 static int nr_buffers_type[NR_LIST] = {0,};
78 static struct buffer_head * unused_list = NULL;
79 static int nr_unused_buffer_heads = 0;
80 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
81 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
83 struct bh_free_head {
84 struct buffer_head *list;
85 spinlock_t lock;
86 };
87 static struct bh_free_head free_list[NR_SIZES];
89 static kmem_cache_t *bh_cachep;
91 static int grow_buffers(int size);
93 /* This is used by some architectures to estimate available memory. */
94 atomic_t buffermem = ATOMIC_INIT(0);
96 /* Here is the parameter block for the bdflush process. If you add or
97 * remove any of the parameters, make sure to update kernel/sysctl.c.
100 #define N_PARAM 9
102 /* The dummy values in this structure are left in there for compatibility
103 * with old programs that play with the /proc entries.
105 union bdflush_param {
106 struct {
107 int nfract; /* Percentage of buffer cache dirty to
108 activate bdflush */
109 int ndirty; /* Maximum number of dirty blocks to write out per
110 wake-cycle */
111 int nrefill; /* Number of clean buffers to try to obtain
112 each time we call refill */
113 int nref_dirt; /* Dirty buffer threshold for activating bdflush
114 when trying to refill buffers. */
115 int dummy1; /* unused */
116 int age_buffer; /* Time for normal buffer to age before we flush it */
117 int age_super; /* Time for superblock to age before we flush it */
118 int dummy2; /* unused */
119 int dummy3; /* unused */
120 } b_un;
121 unsigned int data[N_PARAM];
122 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
124 /* These are the min and max parameter values that we will allow to be assigned */
125 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
126 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
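/*
 * The defaults above therefore mean: wake bdflush at 40% dirty
 * buffers (nfract), write back at most 500 buffers per wake-cycle
 * (ndirty), and age normal buffers for 30 seconds and superblocks
 * for 5 seconds (age_buffer/age_super) before flushing them.
 */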
128 void wakeup_bdflush(int);
131 * Rewrote the wait-routines to use the "new" wait-queue functionality,
132 * and getting rid of the cli-sti pairs. The wait-queue routines still
133 * need cli-sti, but now it's just a couple of 386 instructions or so.
135 * Note that the real wait_on_buffer() is an inline function that checks
136 * if 'b_wait' is set before calling this, so that the queues aren't set
137 * up unnecessarily.
139 void __wait_on_buffer(struct buffer_head * bh)
141 struct task_struct *tsk = current;
142 DECLARE_WAITQUEUE(wait, tsk);
144 atomic_inc(&bh->b_count);
145 add_wait_queue(&bh->b_wait, &wait);
146 repeat:
147 tsk->state = TASK_UNINTERRUPTIBLE;
148 run_task_queue(&tq_disk);
149 if (buffer_locked(bh)) {
150 schedule();
151 goto repeat;
153 tsk->state = TASK_RUNNING;
154 remove_wait_queue(&bh->b_wait, &wait);
155 atomic_dec(&bh->b_count);
158 /* Call sync_buffers with wait!=0 to ensure that the call does not
159 * return until all buffer writes have completed. Sync() may return
160 * before the writes have finished; fsync() may not.
163 /* Godamity-damn. Some buffers (bitmaps for filesystems)
164 * spontaneously dirty themselves without ever brelse being called.
165 * We will ultimately want to put these in a separate list, but for
166 * now we search all of the lists for dirty buffers.
168 static int sync_buffers(kdev_t dev, int wait)
170 int i, retry, pass = 0, err = 0;
171 struct buffer_head * bh, *next;
173 /* One pass for no-wait, three for wait:
174 * 0) write out all dirty, unlocked buffers;
175 * 1) write out all dirty buffers, waiting if locked;
176 * 2) wait for completion by waiting for all buffers to unlock.
178 do {
179 retry = 0;
181 /* We search all lists as a failsafe mechanism, not because we expect
182 * there to be dirty buffers on any of the other lists.
184 repeat:
185 spin_lock(&lru_list_lock);
186 bh = lru_list[BUF_DIRTY];
187 if (!bh)
188 goto repeat2;
190 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
191 next = bh->b_next_free;
193 if (!lru_list[BUF_DIRTY])
194 break;
195 if (dev && bh->b_dev != dev)
196 continue;
197 if (buffer_locked(bh)) {
198 /* Buffer is locked; skip it unless wait is
199 * requested AND pass > 0.
201 if (!wait || !pass) {
202 retry = 1;
203 continue;
205 atomic_inc(&bh->b_count);
206 spin_unlock(&lru_list_lock);
207 wait_on_buffer (bh);
208 atomic_dec(&bh->b_count);
209 goto repeat;
212 /* If an unlocked buffer is not uptodate, there has
213 * been an IO error. Skip it.
215 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
216 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
217 err = -EIO;
218 continue;
221 /* Don't write clean buffers. Don't write ANY buffers
222 * on the third pass.
224 if (!buffer_dirty(bh) || pass >= 2)
225 continue;
227 atomic_inc(&bh->b_count);
228 bh->b_flushtime = 0;
229 spin_unlock(&lru_list_lock);
230 ll_rw_block(WRITE, 1, &bh);
231 atomic_dec(&bh->b_count);
232 retry = 1;
233 goto repeat;
236 repeat2:
237 bh = lru_list[BUF_LOCKED];
238 if (!bh) {
239 spin_unlock(&lru_list_lock);
240 break;
242 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
243 next = bh->b_next_free;
245 if (!lru_list[BUF_LOCKED])
246 break;
247 if (dev && bh->b_dev != dev)
248 continue;
249 if (buffer_locked(bh)) {
250 /* Buffer is locked; skip it unless wait is
251 * requested AND pass > 0.
253 if (!wait || !pass) {
254 retry = 1;
255 continue;
257 atomic_inc(&bh->b_count);
258 spin_unlock(&lru_list_lock);
259 wait_on_buffer (bh);
260 spin_lock(&lru_list_lock);
261 atomic_dec(&bh->b_count);
262 goto repeat2;
265 spin_unlock(&lru_list_lock);
267 /* If we are waiting for the sync to succeed, and if any dirty
268 * blocks were written, then repeat; on the second pass, only
269 * wait for buffers being written (do not pass to write any
270 * more buffers on the second pass).
272 } while (wait && retry && ++pass<=2);
273 return err;
276 void sync_dev(kdev_t dev)
278 sync_buffers(dev, 0);
279 sync_supers(dev);
280 sync_inodes(dev);
281 sync_buffers(dev, 0);
282 DQUOT_SYNC(dev);
284 * FIXME(eric) we need to sync the physical devices here.
285 * This is because some (scsi) controllers have huge amounts of
286 * cache onboard (hundreds of Mb), and we need to instruct
287 * them to commit all of the dirty memory to disk, and we should
288 * not return until this has happened.
290 * This would need to get implemented by going through the assorted
291 * layers so that each block major number can be synced, and this
292 * would call down into the upper and mid-layer scsi.
296 int fsync_dev(kdev_t dev)
298 sync_buffers(dev, 0);
300 lock_kernel();
301 sync_supers(dev);
302 sync_inodes(dev);
303 DQUOT_SYNC(dev);
304 unlock_kernel();
306 return sync_buffers(dev, 1);
309 asmlinkage int sys_sync(void)
311 fsync_dev(0);
312 return 0;
316 * filp may be NULL if called via the msync of a vma.
319 int file_fsync(struct file *filp, struct dentry *dentry)
321 struct inode * inode = dentry->d_inode;
322 struct super_block * sb;
323 kdev_t dev;
325 /* sync the inode to buffers */
326 write_inode_now(inode);
328 /* sync the superblock to buffers */
329 sb = inode->i_sb;
330 wait_on_super(sb);
331 if (sb->s_op && sb->s_op->write_super)
332 sb->s_op->write_super(sb);
334 /* .. finally sync the buffers to disk */
335 dev = inode->i_dev;
336 return sync_buffers(dev, 1);
339 asmlinkage int sys_fsync(unsigned int fd)
341 struct file * file;
342 struct dentry * dentry;
343 struct inode * inode;
344 int err;
346 lock_kernel();
347 err = -EBADF;
348 file = fget(fd);
349 if (!file)
350 goto out;
352 dentry = file->f_dentry;
353 if (!dentry)
354 goto out_putf;
356 inode = dentry->d_inode;
357 if (!inode)
358 goto out_putf;
360 err = -EINVAL;
361 if (!file->f_op || !file->f_op->fsync)
362 goto out_putf;
364 /* We need to protect against concurrent writers.. */
365 down(&inode->i_sem);
366 err = file->f_op->fsync(file, dentry);
367 up(&inode->i_sem);
369 out_putf:
370 fput(file);
371 out:
372 unlock_kernel();
373 return err;
376 asmlinkage int sys_fdatasync(unsigned int fd)
378 struct file * file;
379 struct dentry * dentry;
380 struct inode * inode;
381 int err;
383 lock_kernel();
384 err = -EBADF;
385 file = fget(fd);
386 if (!file)
387 goto out;
389 dentry = file->f_dentry;
390 if (!dentry)
391 goto out_putf;
393 inode = dentry->d_inode;
394 if (!inode)
395 goto out_putf;
397 err = -EINVAL;
398 if (!file->f_op || !file->f_op->fsync)
399 goto out_putf;
401 /* this needs further work, at the moment it is identical to fsync() */
402 down(&inode->i_sem);
403 err = file->f_op->fsync(file, dentry);
404 up(&inode->i_sem);
406 out_putf:
407 fput(file);
408 out:
409 unlock_kernel();
410 return err;
413 void invalidate_buffers(kdev_t dev)
415 int nlist;
417 spin_lock(&lru_list_lock);
418 for(nlist = 0; nlist < NR_LIST; nlist++) {
419 struct buffer_head * bh;
420 int i;
421 retry:
422 bh = lru_list[nlist];
423 if (!bh)
424 continue;
425 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
426 if (bh->b_dev != dev)
427 continue;
428 if (buffer_locked(bh)) {
429 atomic_inc(&bh->b_count);
430 spin_unlock(&lru_list_lock);
431 wait_on_buffer(bh);
432 spin_lock(&lru_list_lock);
433 atomic_dec(&bh->b_count);
434 goto retry;
436 if (atomic_read(&bh->b_count))
437 continue;
438 bh->b_flushtime = 0;
439 clear_bit(BH_Protected, &bh->b_state);
440 clear_bit(BH_Uptodate, &bh->b_state);
441 clear_bit(BH_Dirty, &bh->b_state);
442 clear_bit(BH_Req, &bh->b_state);
445 spin_unlock(&lru_list_lock);
448 /* After several hours of tedious analysis, the following hash
449 * function won. Do not mess with it... -DaveM
451 #define _hashfn(dev,block) \
452 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
453 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
454 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
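/*
 * hash(dev,block) yields the hash chain head for a (device, block)
 * pair. The chains are linked through b_next, with b_pprev pointing
 * back at the slot that points to the buffer; __hash_link() and
 * __hash_unlink() below maintain them, and get_hash_table() walks
 * them under hash_table_lock.
 */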
456 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
458 if ((bh->b_next = *head) != NULL)
459 bh->b_next->b_pprev = &bh->b_next;
460 *head = bh;
461 bh->b_pprev = head;
464 static __inline__ void __hash_unlink(struct buffer_head *bh)
466 if (bh->b_next)
467 bh->b_next->b_pprev = bh->b_pprev;
468 *(bh->b_pprev) = bh->b_next;
469 bh->b_pprev = NULL;
470 }
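/*
 * The lru lists are circular, doubly linked through b_next_free and
 * b_prev_free, with nr_buffers_type[] tracking the length of each
 * list. Callers of the helpers below must hold lru_list_lock.
 */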
472 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
474 struct buffer_head **bhp = &lru_list[blist];
476 if(!*bhp) {
477 *bhp = bh;
478 bh->b_prev_free = bh;
480 bh->b_next_free = *bhp;
481 bh->b_prev_free = (*bhp)->b_prev_free;
482 (*bhp)->b_prev_free->b_next_free = bh;
483 (*bhp)->b_prev_free = bh;
484 nr_buffers_type[blist]++;
487 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
489 if (bh->b_prev_free || bh->b_next_free) {
490 bh->b_prev_free->b_next_free = bh->b_next_free;
491 bh->b_next_free->b_prev_free = bh->b_prev_free;
492 if (lru_list[blist] == bh)
493 lru_list[blist] = bh->b_next_free;
494 if (lru_list[blist] == bh)
495 lru_list[blist] = NULL;
496 bh->b_next_free = bh->b_prev_free = NULL;
497 nr_buffers_type[blist]--;
501 static void __remove_from_free_list(struct buffer_head * bh, int index)
503 if(bh->b_next_free == bh)
504 free_list[index].list = NULL;
505 else {
506 bh->b_prev_free->b_next_free = bh->b_next_free;
507 bh->b_next_free->b_prev_free = bh->b_prev_free;
508 if (free_list[index].list == bh)
509 free_list[index].list = bh->b_next_free;
511 bh->b_next_free = bh->b_prev_free = NULL;
514 /* The following two functions must operate atomically
515 * because they control the visibility of a buffer head
516 * to the rest of the kernel.
518 static __inline__ void __remove_from_queues(struct buffer_head *bh)
520 write_lock(&hash_table_lock);
521 if (bh->b_pprev)
522 __hash_unlink(bh);
523 __remove_from_lru_list(bh, bh->b_list);
524 write_unlock(&hash_table_lock);
527 static void insert_into_queues(struct buffer_head *bh)
529 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
531 spin_lock(&lru_list_lock);
532 write_lock(&hash_table_lock);
533 __hash_link(bh, head);
534 __insert_into_lru_list(bh, bh->b_list);
535 write_unlock(&hash_table_lock);
536 spin_unlock(&lru_list_lock);
539 /* This function must only run if there are no other
540 * references _anywhere_ to this buffer head.
542 static void put_last_free(struct buffer_head * bh)
544 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
545 struct buffer_head **bhp = &head->list;
547 spin_lock(&head->lock);
548 bh->b_dev = B_FREE;
549 if(!*bhp) {
550 *bhp = bh;
551 bh->b_prev_free = bh;
553 bh->b_next_free = *bhp;
554 bh->b_prev_free = (*bhp)->b_prev_free;
555 (*bhp)->b_prev_free->b_next_free = bh;
556 (*bhp)->b_prev_free = bh;
557 spin_unlock(&head->lock);
561 * Why like this, I hear you say... The reason is race-conditions.
562 * As we don't lock buffers (unless we are reading them, that is),
563 * something might happen to it while we sleep (ie a read-error
564 * will force it bad). This shouldn't really happen currently, but
565 * the code is ready.
567 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
569 struct buffer_head **head = &hash(dev, block);
570 struct buffer_head *bh;
572 read_lock(&hash_table_lock);
573 for(bh = *head; bh; bh = bh->b_next)
574 if (bh->b_blocknr == block &&
575 bh->b_size == size &&
576 bh->b_dev == dev)
577 break;
578 if (bh)
579 atomic_inc(&bh->b_count);
580 read_unlock(&hash_table_lock);
582 return bh;
585 unsigned int get_hardblocksize(kdev_t dev)
588 * Get the hard sector size for the given device. If we don't know
589 * what it is, return 0.
591 if (hardsect_size[MAJOR(dev)] != NULL) {
592 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
593 if (blksize != 0)
594 return blksize;
598 * We don't know what the hardware sector size for this device is.
599 * Return 0 indicating that we don't know.
601 return 0;
604 void set_blocksize(kdev_t dev, int size)
606 extern int *blksize_size[];
607 int i, nlist;
608 struct buffer_head * bh, *bhnext;
610 if (!blksize_size[MAJOR(dev)])
611 return;
613 /* Size must be a power of two, and between 512 and PAGE_SIZE */
614 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
615 panic("Invalid blocksize passed to set_blocksize");
617 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
618 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
619 return;
621 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
622 return;
623 sync_buffers(dev, 2);
624 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
626 /* We need to be quite careful how we do this - we are moving entries
627 * around on the free list, and we can get in a loop if we are not careful.
629 for(nlist = 0; nlist < NR_LIST; nlist++) {
630 repeat:
631 spin_lock(&lru_list_lock);
632 bh = lru_list[nlist];
633 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
634 if(!bh)
635 break;
637 bhnext = bh->b_next_free;
638 if (bh->b_dev != dev)
639 continue;
640 if (bh->b_size == size)
641 continue;
642 if (buffer_locked(bh)) {
643 atomic_inc(&bh->b_count);
644 spin_unlock(&lru_list_lock);
645 wait_on_buffer(bh);
646 atomic_dec(&bh->b_count);
647 goto repeat;
649 if (bh->b_dev == dev && bh->b_size != size) {
650 clear_bit(BH_Dirty, &bh->b_state);
651 clear_bit(BH_Uptodate, &bh->b_state);
652 clear_bit(BH_Req, &bh->b_state);
653 bh->b_flushtime = 0;
655 if (atomic_read(&bh->b_count) == 0) {
656 __remove_from_queues(bh);
657 put_last_free(bh);
660 spin_unlock(&lru_list_lock);
665 * We used to try various strange things. Let's not.
667 static void refill_freelist(int size)
669 if (!grow_buffers(size)) {
670 wakeup_bdflush(1);
671 current->policy |= SCHED_YIELD;
672 schedule();
676 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
678 bh->b_list = BUF_CLEAN;
679 bh->b_flushtime = 0;
680 bh->b_end_io = handler;
681 bh->b_dev_id = dev_id;
684 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
686 mark_buffer_uptodate(bh, uptodate);
687 unlock_buffer(bh);
690 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
692 mark_buffer_uptodate(bh, uptodate);
693 unlock_buffer(bh);
694 BUG();
697 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
699 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
700 unsigned long flags;
701 struct buffer_head *tmp;
702 struct page *page;
703 int free;
705 mark_buffer_uptodate(bh, uptodate);
707 /* This is a temporary buffer used for page I/O. */
708 page = mem_map + MAP_NR(bh->b_data);
710 if (!uptodate)
711 SetPageError(page);
714 * Be _very_ careful from here on. Bad things can happen if
715 * two buffer heads end IO at almost the same time and both
716 * decide that the page is now completely done.
718 * Async buffer_heads are here only as labels for IO, and get
719 * thrown away once the IO for this page is complete. IO is
720 * deemed complete once all buffers have been visited
721 * (b_count==0) and are now unlocked. We must make sure that
722 * only the _last_ buffer that decrements its count is the one
723 * that frees the page..
725 spin_lock_irqsave(&page_uptodate_lock, flags);
726 unlock_buffer(bh);
727 atomic_dec(&bh->b_count);
728 tmp = bh->b_this_page;
729 while (tmp != bh) {
730 if (atomic_read(&tmp->b_count) &&
731 (tmp->b_end_io == end_buffer_io_async))
732 goto still_busy;
733 tmp = tmp->b_this_page;
736 /* OK, the async IO on this page is complete. */
737 spin_unlock_irqrestore(&page_uptodate_lock, flags);
740 * if none of the buffers had errors then we can set the
741 * page uptodate:
743 if (!PageError(page))
744 SetPageUptodate(page);
747 * Run the hooks that have to be done when a page I/O has completed.
749 * Note - we need to test the flags before we unlock the page, but
750 * we must not actually free the page until after the unlock!
752 if (test_and_clear_bit(PG_decr_after, &page->flags))
753 atomic_dec(&nr_async_pages);
755 if (test_and_clear_bit(PG_free_swap_after, &page->flags))
756 swap_free(page->offset);
758 free = test_and_clear_bit(PG_free_after, &page->flags);
760 if (page->owner != -1)
761 PAGE_BUG(page);
762 page->owner = (int)current;
763 UnlockPage(page);
765 if (free)
766 __free_page(page);
768 return;
770 still_busy:
771 spin_unlock_irqrestore(&page_uptodate_lock, flags);
772 return;
777 * Ok, this is getblk, and it isn't very clear, again to hinder
778 * race-conditions. Most of the code is seldom used, (ie repeating),
779 * so it should be much more efficient than it looks.
781 * The algorithm is changed: hopefully better, and an elusive bug removed.
783 * 14.02.92: changed it to sync dirty buffers a bit: better performance
784 * when the filesystem starts to get full of dirty blocks (I hope).
786 struct buffer_head * getblk(kdev_t dev, int block, int size)
788 struct buffer_head * bh;
789 int isize;
791 repeat:
792 bh = get_hash_table(dev, block, size);
793 if (bh) {
794 if (!buffer_dirty(bh)) {
795 bh->b_flushtime = 0;
797 goto out;
800 isize = BUFSIZE_INDEX(size);
801 spin_lock(&free_list[isize].lock);
802 bh = free_list[isize].list;
803 if (bh) {
804 __remove_from_free_list(bh, isize);
805 atomic_set(&bh->b_count, 1);
807 spin_unlock(&free_list[isize].lock);
808 if (!bh)
809 goto refill;
811 /* OK, FINALLY we know that this buffer is the only one of its kind,
812 * we hold a reference (b_count>0), it is unlocked, and it is clean.
814 init_buffer(bh, end_buffer_io_sync, NULL);
815 bh->b_dev = dev;
816 bh->b_blocknr = block;
817 bh->b_state = 1 << BH_Mapped;
819 /* Insert the buffer into the regular lists */
820 insert_into_queues(bh);
821 goto out;
824 * If we block while refilling the free list, somebody may
825 * create the buffer first ... search the hashes again.
827 refill:
828 refill_freelist(size);
829 goto repeat;
830 out:
831 return bh;
835 * if a new dirty buffer is created we need to balance bdflush.
837 * in the future we might want to make bdflush aware of different
838 * pressures on different devices - thus the (currently unused)
839 * 'dev' parameter.
841 int too_many_dirty_buffers;
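/*
 * Roughly: above ndirty dirty buffers we wake bdflush asynchronously;
 * above 2*ndirty we set too_many_dirty_buffers and wait for it.
 */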
843 void balance_dirty(kdev_t dev)
845 int dirty = nr_buffers_type[BUF_DIRTY];
846 int ndirty = bdf_prm.b_un.ndirty;
848 if (dirty > ndirty) {
849 if (dirty > 2*ndirty) {
850 too_many_dirty_buffers = 1;
851 wakeup_bdflush(1);
852 return;
854 wakeup_bdflush(0);
856 too_many_dirty_buffers = 0;
857 return;
860 static inline void __mark_dirty(struct buffer_head *bh, int flag)
862 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
863 clear_bit(BH_New, &bh->b_state);
864 refile_buffer(bh);
867 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
869 __mark_dirty(bh, flag);
873 * A buffer may need to be moved from one buffer list to another
874 * (e.g. in case it is not shared any more). Handle this.
876 static __inline__ void __refile_buffer(struct buffer_head *bh)
878 int dispose = BUF_CLEAN;
879 if (buffer_locked(bh))
880 dispose = BUF_LOCKED;
881 if (buffer_dirty(bh))
882 dispose = BUF_DIRTY;
883 if (dispose != bh->b_list) {
884 __remove_from_lru_list(bh, bh->b_list);
885 bh->b_list = dispose;
886 __insert_into_lru_list(bh, dispose);
890 void refile_buffer(struct buffer_head *bh)
892 spin_lock(&lru_list_lock);
893 __refile_buffer(bh);
894 spin_unlock(&lru_list_lock);
898 * Release a buffer head
900 void __brelse(struct buffer_head * buf)
902 touch_buffer(buf);
904 if (atomic_read(&buf->b_count)) {
905 atomic_dec(&buf->b_count);
906 return;
908 printk("VFS: brelse: Trying to free free buffer\n");
912 * bforget() is like brelse(), except it puts the buffer on the
913 * free list if it can.. We can NOT free the buffer if:
914 * - there are other users of it
915 * - it is locked and thus can have active IO
917 void __bforget(struct buffer_head * buf)
919 spin_lock(&lru_list_lock);
920 write_lock(&hash_table_lock);
921 if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
922 touch_buffer(buf);
923 atomic_dec(&buf->b_count);
924 } else {
925 atomic_set(&buf->b_count, 0);
926 buf->b_state = 0;
927 if (buf->b_pprev)
928 __hash_unlink(buf);
929 __remove_from_lru_list(buf, buf->b_list);
930 put_last_free(buf);
932 write_unlock(&hash_table_lock);
933 spin_unlock(&lru_list_lock);
937 * bread() reads a specified block and returns the buffer that contains
938 * it. It returns NULL if the block was unreadable.
939 */
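/*
 * Typical use (sketch):
 *
 *	struct buffer_head *bh = bread(dev, block, sb->s_blocksize);
 *	if (!bh)
 *		return -EIO;
 *	memcpy(data, bh->b_data, bh->b_size);
 *	brelse(bh);
 */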
940 struct buffer_head * bread(kdev_t dev, int block, int size)
942 struct buffer_head * bh;
944 bh = getblk(dev, block, size);
945 if (buffer_uptodate(bh))
946 return bh;
947 ll_rw_block(READ, 1, &bh);
948 wait_on_buffer(bh);
949 if (buffer_uptodate(bh))
950 return bh;
951 brelse(bh);
952 return NULL;
956 * Ok, breada can be used as bread, but additionally to mark other
957 * blocks for reading as well. End the argument list with a negative
958 * number.
961 #define NBUF 16
963 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
964 unsigned int pos, unsigned int filesize)
966 struct buffer_head * bhlist[NBUF];
967 unsigned int blocks;
968 struct buffer_head * bh;
969 int index;
970 int i, j;
972 if (pos >= filesize)
973 return NULL;
975 if (block < 0)
976 return NULL;
978 bh = getblk(dev, block, bufsize);
979 index = BUFSIZE_INDEX(bh->b_size);
981 if (buffer_uptodate(bh))
982 return(bh);
983 else ll_rw_block(READ, 1, &bh);
985 blocks = (filesize - pos) >> (9+index);
987 if (blocks < (read_ahead[MAJOR(dev)] >> index))
988 blocks = read_ahead[MAJOR(dev)] >> index;
989 if (blocks > NBUF)
990 blocks = NBUF;
992 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
994 bhlist[0] = bh;
995 j = 1;
996 for(i=1; i<blocks; i++) {
997 bh = getblk(dev,block+i,bufsize);
998 if (buffer_uptodate(bh)) {
999 brelse(bh);
1000 break;
1002 else bhlist[j++] = bh;
1005 /* Request the read for these buffers, and then release them. */
1006 if (j>1)
1007 ll_rw_block(READA, (j-1), bhlist+1);
1008 for(i=1; i<j; i++)
1009 brelse(bhlist[i]);
1011 /* Wait for this buffer, and then continue on. */
1012 bh = bhlist[0];
1013 wait_on_buffer(bh);
1014 if (buffer_uptodate(bh))
1015 return bh;
1016 brelse(bh);
1017 return NULL;
1021 * Note: the caller should wake up the buffer_wait list if needed.
1023 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1025 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1026 kmem_cache_free(bh_cachep, bh);
1027 } else {
1028 bh->b_blocknr = -1;
1029 init_waitqueue_head(&bh->b_wait);
1030 nr_unused_buffer_heads++;
1031 bh->b_next_free = unused_list;
1032 bh->b_this_page = NULL;
1033 unused_list = bh;
1037 static void put_unused_buffer_head(struct buffer_head *bh)
1039 spin_lock(&unused_list_lock);
1040 __put_unused_buffer_head(bh);
1041 spin_unlock(&unused_list_lock);
1045 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1046 * no-buffer-head deadlock. Return NULL on failure; waiting for
1047 * buffer heads is now handled in create_buffers().
1049 static struct buffer_head * get_unused_buffer_head(int async)
1051 struct buffer_head * bh;
1053 spin_lock(&unused_list_lock);
1054 if (nr_unused_buffer_heads > NR_RESERVED) {
1055 bh = unused_list;
1056 unused_list = bh->b_next_free;
1057 nr_unused_buffer_heads--;
1058 spin_unlock(&unused_list_lock);
1059 return bh;
1061 spin_unlock(&unused_list_lock);
1063 /* This is critical. We can't swap out pages to get
1064 * more buffer heads, because the swap-out may need
1065 * more buffer-heads itself. Thus SLAB_BUFFER.
1067 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1068 memset(bh, 0, sizeof(*bh));
1069 init_waitqueue_head(&bh->b_wait);
1070 return bh;
1074 * If we need an async buffer, use the reserved buffer heads.
1076 if (async) {
1077 spin_lock(&unused_list_lock);
1078 if (unused_list) {
1079 bh = unused_list;
1080 unused_list = bh->b_next_free;
1081 nr_unused_buffer_heads--;
1082 spin_unlock(&unused_list_lock);
1083 return bh;
1085 spin_unlock(&unused_list_lock);
1087 #if 0
1089 * (Pending further analysis ...)
1090 * Ordinary (non-async) requests can use a different memory priority
1091 * to free up pages. Any swapping thus generated will use async
1092 * buffer heads.
1094 if(!async &&
1095 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1096 memset(bh, 0, sizeof(*bh));
1097 init_waitqueue_head(&bh->b_wait);
1098 return bh;
1100 #endif
1102 return NULL;
1106 * Create the appropriate buffers when given a page for data area and
1107 * the size of each buffer.. Use the bh->b_this_page linked list to
1108 * follow the buffers created. Return NULL if unable to create more
1109 * buffers.
1110 * The async flag is used to differentiate async IO (paging, swapping)
1111 * from ordinary buffer allocations, and only async requests are allowed
1112 * to sleep waiting for buffer heads.
1114 static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async)
1116 DECLARE_WAITQUEUE(wait, current);
1117 struct buffer_head *bh, *head;
1118 long offset;
1120 try_again:
1121 head = NULL;
1122 offset = PAGE_SIZE;
1123 while ((offset -= size) >= 0) {
1124 bh = get_unused_buffer_head(async);
1125 if (!bh)
1126 goto no_grow;
1128 bh->b_dev = B_FREE; /* Flag as unused */
1129 bh->b_this_page = head;
1130 head = bh;
1132 bh->b_state = 0;
1133 bh->b_next_free = NULL;
1134 bh->b_pprev = NULL;
1135 atomic_set(&bh->b_count, 0);
1136 bh->b_size = size;
1138 bh->b_data = (char *) (page+offset);
1139 bh->b_list = BUF_CLEAN;
1140 bh->b_flushtime = 0;
1141 bh->b_end_io = end_buffer_io_bad;
1143 return head;
1145 * In case anything failed, we just free everything we got.
1147 no_grow:
1148 if (head) {
1149 do {
1150 bh = head;
1151 head = head->b_this_page;
1152 put_unused_buffer_head(bh);
1153 } while (head);
1155 /* Wake up any waiters ... */
1156 wake_up(&buffer_wait);
1160 * Return failure for non-async IO requests. Async IO requests
1161 * are not allowed to fail, so we have to wait until buffer heads
1162 * become available. But we don't want tasks sleeping with
1163 * partially complete buffers, so all were released above.
1165 if (!async)
1166 return NULL;
1168 /* We're _really_ low on memory. Now we just
1169 * wait for old buffer heads to become free due to
1170 * finishing IO. Since this is an async request and
1171 * the reserve list is empty, we're sure there are
1172 * async buffer heads in use.
1174 run_task_queue(&tq_disk);
1177 * Set our state for sleeping, then check again for buffer heads.
1178 * This ensures we won't miss a wake_up from an interrupt.
1180 add_wait_queue(&buffer_wait, &wait);
1181 current->state = TASK_UNINTERRUPTIBLE;
1182 if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
1183 current->policy |= SCHED_YIELD;
1184 schedule();
1186 remove_wait_queue(&buffer_wait, &wait);
1187 current->state = TASK_RUNNING;
1188 goto try_again;
1191 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1193 struct buffer_head *head, *bh, *tail;
1194 int block;
1196 if (!PageLocked(page))
1197 BUG();
1198 if (page->owner != (int)current)
1199 PAGE_BUG(page);
1201 * Allocate async buffer heads pointing to this page, just for I/O.
1202 * They show up in the buffer hash table and are registered in
1203 * page->buffers.
1205 head = create_buffers(page_address(page), size, 1);
1206 if (page->buffers)
1207 BUG();
1208 if (!head)
1209 BUG();
1210 tail = head;
1211 for (bh = head; bh; bh = bh->b_this_page) {
1212 block = *(b++);
1214 tail = bh;
1215 init_buffer(bh, end_buffer_io_async, NULL);
1216 bh->b_dev = dev;
1217 bh->b_blocknr = block;
1220 * When we use bmap, we define block zero to represent
1221 * a hole. ll_rw_page, however, may legitimately
1222 * access block zero, and we need to distinguish the
1223 * two cases.
1225 if (bmap && !block) {
1226 memset(bh->b_data, 0, size);
1227 set_bit(BH_Uptodate, &bh->b_state);
1228 continue;
1230 set_bit(BH_Mapped, &bh->b_state);
1232 tail->b_this_page = head;
1233 get_page(page);
1234 page->buffers = head;
1235 return 0;
1239 * We don't have to release all buffers here, but
1240 * we have to be sure that no dirty buffer is left
1241 * and no IO is going on (no buffer is locked), because
1242 * we have truncated the file and are going to free the
1243 * blocks on-disk..
1245 int block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
1247 struct buffer_head *head, *bh, *next;
1248 unsigned int curr_off = 0;
1250 if (!PageLocked(page))
1251 BUG();
1252 if (!page->buffers)
1253 return 0;
1255 head = page->buffers;
1256 bh = head;
1257 do {
1258 unsigned int next_off = curr_off + bh->b_size;
1259 next = bh->b_this_page;
1262 * is this block fully flushed?
1264 if (offset <= curr_off) {
1265 if (buffer_mapped(bh)) {
1266 atomic_inc(&bh->b_count);
1267 wait_on_buffer(bh);
1268 if (bh->b_dev == B_FREE)
1269 BUG();
1270 mark_buffer_clean(bh);
1271 clear_bit(BH_Uptodate, &bh->b_state);
1272 clear_bit(BH_Mapped, &bh->b_state);
1273 bh->b_blocknr = 0;
1274 atomic_dec(&bh->b_count);
1277 curr_off = next_off;
1278 bh = next;
1279 } while (bh != head);
1282 * subtle. We release buffer-heads only if this is
1283 * the 'final' flushpage. We have invalidated the bmap
1284 * cached value unconditionally, so real IO is not
1285 * possible anymore.
1287 * If the free doesn't work out, the buffers can be
1288 * left around - they just turn into anonymous buffers
1289 * instead.
1291 if (!offset) {
1292 if (!try_to_free_buffers(page))
1293 atomic_add(PAGE_CACHE_SIZE, &buffermem);
1296 return 0;
1299 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1301 struct buffer_head *bh, *head, *tail;
1303 head = create_buffers(page_address(page), blocksize, 1);
1304 if (page->buffers)
1305 BUG();
1307 bh = head;
1308 do {
1309 bh->b_dev = inode->i_dev;
1310 bh->b_blocknr = 0;
1311 bh->b_end_io = end_buffer_io_bad;
1312 tail = bh;
1313 bh = bh->b_this_page;
1314 } while (bh);
1315 tail->b_this_page = head;
1316 page->buffers = head;
1317 get_page(page);
1321 * block_write_full_page() is SMP-safe - currently it's still
1322 * being called with the kernel lock held, but the code is ready.
1324 int block_write_full_page(struct file *file, struct page *page)
1326 struct dentry *dentry = file->f_dentry;
1327 struct inode *inode = dentry->d_inode;
1328 int err, i;
1329 unsigned long block, offset;
1330 struct buffer_head *bh, *head;
1332 if (!PageLocked(page))
1333 BUG();
1335 if (!page->buffers)
1336 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1337 head = page->buffers;
1339 offset = page->offset;
1340 block = offset >> inode->i_sb->s_blocksize_bits;
1342 // FIXME: currently we assume page alignment.
1343 if (offset & (PAGE_SIZE-1))
1344 BUG();
1346 bh = head;
1347 i = 0;
1348 do {
1349 if (!bh)
1350 BUG();
1353 * If the buffer isn't up-to-date, we can't be sure
1354 * that the buffer has been initialized with the proper
1355 * block number information etc..
1357 * Leave it to the low-level FS to make all those
1358 * decisions (block #0 may actually be a valid block)
1360 bh->b_end_io = end_buffer_io_sync;
1361 if (!buffer_mapped(bh)) {
1362 err = inode->i_op->get_block(inode, block, bh, 1);
1363 if (err)
1364 goto out;
1366 set_bit(BH_Uptodate, &bh->b_state);
1367 mark_buffer_dirty(bh,0);
1369 bh = bh->b_this_page;
1370 block++;
1371 } while (bh != head);
1373 SetPageUptodate(page);
1374 return 0;
1375 out:
1376 ClearPageUptodate(page);
1377 return err;
1380 int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
1382 struct dentry *dentry = file->f_dentry;
1383 struct inode *inode = dentry->d_inode;
1384 unsigned long block;
1385 int err, partial;
1386 unsigned long blocksize, start_block, end_block;
1387 unsigned long start_offset, start_bytes, end_bytes;
1388 unsigned long bbits, blocks, i, len;
1389 struct buffer_head *bh, *head;
1390 char * target_buf;
1392 target_buf = (char *)page_address(page) + offset;
1394 if (!PageLocked(page))
1395 BUG();
1397 blocksize = inode->i_sb->s_blocksize;
1398 if (!page->buffers)
1399 create_empty_buffers(page, inode, blocksize);
1400 head = page->buffers;
1402 bbits = inode->i_sb->s_blocksize_bits;
1403 block = page->offset >> bbits;
1404 blocks = PAGE_SIZE >> bbits;
1405 start_block = offset >> bbits;
1406 end_block = (offset + bytes - 1) >> bbits;
1407 start_offset = offset & (blocksize - 1);
1408 start_bytes = blocksize - start_offset;
1409 if (start_bytes > bytes)
1410 start_bytes = bytes;
1411 end_bytes = (offset+bytes) & (blocksize - 1);
1412 if (end_bytes > bytes)
1413 end_bytes = bytes;
1415 if (offset < 0 || offset >= PAGE_SIZE)
1416 BUG();
1417 if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
1418 BUG();
1419 if (start_block < 0 || start_block >= blocks)
1420 BUG();
1421 if (end_block < 0 || end_block >= blocks)
1422 BUG();
1423 // FIXME: currently we assume page alignment.
1424 if (page->offset & (PAGE_SIZE-1))
1425 BUG();
1427 i = 0;
1428 bh = head;
1429 partial = 0;
1430 do {
1431 if (!bh)
1432 BUG();
1434 if ((i < start_block) || (i > end_block)) {
1435 if (!buffer_uptodate(bh))
1436 partial = 1;
1437 goto skip;
1441 * If the buffer is not up-to-date, we need to ask the low-level
1442 * FS to do something for us (we used to have assumptions about
1443 * the meaning of b_blocknr etc, that's bad).
1445 * If "update" is set, that means that the low-level FS should
1446 * try to make sure that the block is up-to-date because we're
1447 * not going to fill it completely.
1449 bh->b_end_io = end_buffer_io_sync;
1450 if (!buffer_mapped(bh)) {
1451 err = inode->i_op->get_block(inode, block, bh, 1);
1452 if (err)
1453 goto out;
1456 if (!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) {
1457 if (buffer_new(bh)) {
1458 memset(bh->b_data, 0, bh->b_size);
1459 } else {
1460 ll_rw_block(READ, 1, &bh);
1461 wait_on_buffer(bh);
1462 err = -EIO;
1463 if (!buffer_uptodate(bh))
1464 goto out;
1468 len = blocksize;
1469 if (start_offset) {
1470 len = start_bytes;
1471 start_offset = 0;
1472 } else if (end_bytes && (i == end_block)) {
1473 len = end_bytes;
1474 end_bytes = 0;
1476 err = copy_from_user(target_buf, buf, len);
1477 target_buf += len;
1478 buf += len;
1481 * we dirty buffers only after copying the data into
1482 * the page - this way we can dirty the buffer even if
1483 * the bh is still doing IO.
1485 * NOTE! This also does a direct dirty balance check,
1486 * rather than relying on bdflush just waking up every
1487 * once in a while. This is to catch (and slow down)
1488 * the processes that write tons of buffers..
1490 * Note how we do NOT want to do this in the full block
1491 * case: full pages are flushed not by the people who
1492 * dirtied them, but by people who need memory. And we
1493 * should not penalize them for somebody else writing
1494 * lots of dirty pages.
1496 set_bit(BH_Uptodate, &bh->b_state);
1497 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1498 __mark_dirty(bh, 0);
1499 if (too_many_dirty_buffers)
1500 balance_dirty(bh->b_dev);
1503 if (err) {
1504 err = -EFAULT;
1505 goto out;
1508 skip:
1509 i++;
1510 block++;
1511 bh = bh->b_this_page;
1512 } while (bh != head);
1515 * If this is a partial write that happened to make all buffers
1516 * uptodate then we can optimize away a bogus readpage() for
1517 * the next read(). Here we 'discover' whether the page went
1518 * uptodate as a result of this (potentially partial) write.
1519 */
1520 if (!partial)
1521 SetPageUptodate(page);
1522 return bytes;
1523 out:
1524 ClearPageUptodate(page);
1525 return err;
1529 * Start I/O on a page.
1530 * This function expects the page to be locked and may return
1531 * before I/O is complete. You then have to check page->locked,
1532 * page->uptodate, and maybe wait on page->wait.
1534 * brw_page() is SMP-safe, although it's being called with the
1535 * kernel lock held - but the code is ready.
1537 * FIXME: we need a swapper_inode->get_block function to remove
1538 * some of the bmap kludges and interface ugliness here.
1540 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1542 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1543 int nr, fresh /* temporary debugging flag */, block;
1545 if (!PageLocked(page))
1546 panic("brw_page: page not locked for I/O");
1547 // clear_bit(PG_error, &page->flags);
1549 * We pretty much rely on the page lock for this, because
1550 * create_page_buffers() might sleep.
1552 fresh = 0;
1553 if (!page->buffers) {
1554 create_page_buffers(rw, page, dev, b, size, bmap);
1555 fresh = 1;
1557 if (!page->buffers)
1558 BUG();
1559 page->owner = -1;
1561 head = page->buffers;
1562 bh = head;
1563 nr = 0;
1564 do {
1565 block = *(b++);
1567 if (fresh && (atomic_read(&bh->b_count) != 0))
1568 BUG();
1569 if (rw == READ) {
1570 if (!fresh)
1571 BUG();
1572 if (bmap && !block) {
1573 if (block)
1574 BUG();
1575 } else {
1576 if (bmap && !block)
1577 BUG();
1578 if (!buffer_uptodate(bh)) {
1579 arr[nr++] = bh;
1580 atomic_inc(&bh->b_count);
1583 } else { /* WRITE */
1584 if (!bh->b_blocknr) {
1585 if (!block)
1586 BUG();
1587 bh->b_blocknr = block;
1588 } else {
1589 if (!block)
1590 BUG();
1592 set_bit(BH_Uptodate, &bh->b_state);
1593 set_bit(BH_Dirty, &bh->b_state);
1594 arr[nr++] = bh;
1595 atomic_inc(&bh->b_count);
1597 bh = bh->b_this_page;
1598 } while (bh != head);
1599 if (rw == READ)
1600 ++current->mm->maj_flt;
1601 if ((rw == READ) && nr) {
1602 if (Page_Uptodate(page))
1603 BUG();
1604 ll_rw_block(rw, nr, arr);
1605 } else {
1606 if (!nr && rw == READ) {
1607 SetPageUptodate(page);
1608 page->owner = (int)current;
1609 UnlockPage(page);
1611 if (nr && (rw == WRITE))
1612 ll_rw_block(rw, nr, arr);
1614 return 0;
1618 * Generic "read page" function for block devices that have the normal
1619 * bmap functionality. This is most of the block device filesystems.
1620 * Reads the page asynchronously --- the unlock_buffer() and
1621 * mark_buffer_uptodate() functions propagate buffer state into the
1622 * page struct once IO has completed.
1624 int block_read_full_page(struct file * file, struct page * page)
1626 struct dentry *dentry = file->f_dentry;
1627 struct inode *inode = dentry->d_inode;
1628 unsigned long iblock;
1629 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1630 unsigned int blocksize, blocks;
1631 int nr;
1633 if (!PageLocked(page))
1634 PAGE_BUG(page);
1635 blocksize = inode->i_sb->s_blocksize;
1636 if (!page->buffers)
1637 create_empty_buffers(page, inode, blocksize);
1638 head = page->buffers;
1640 blocks = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1641 iblock = page->offset >> inode->i_sb->s_blocksize_bits;
1642 page->owner = -1;
1643 head = page->buffers;
1644 bh = head;
1645 nr = 0;
1647 do {
1648 if (buffer_uptodate(bh))
1649 continue;
1651 if (!buffer_mapped(bh)) {
1652 inode->i_op->get_block(inode, iblock, bh, 0);
1653 if (!buffer_mapped(bh)) {
1654 memset(bh->b_data, 0, blocksize);
1655 set_bit(BH_Uptodate, &bh->b_state);
1656 continue;
1660 init_buffer(bh, end_buffer_io_async, NULL);
1661 atomic_inc(&bh->b_count);
1662 arr[nr] = bh;
1663 nr++;
1664 } while (iblock++, (bh = bh->b_this_page) != head);
1666 ++current->mm->maj_flt;
1667 if (nr) {
1668 if (Page_Uptodate(page))
1669 BUG();
1670 ll_rw_block(READ, nr, arr);
1671 } else {
1673 * all buffers are uptodate - we can set the page
1674 * uptodate as well.
1676 SetPageUptodate(page);
1677 page->owner = (int)current;
1678 UnlockPage(page);
1680 return 0;
1684 * Try to increase the number of buffers available: the size argument
1685 * is used to determine what kind of buffers we want.
1687 static int grow_buffers(int size)
1689 unsigned long page;
1690 struct buffer_head *bh, *tmp;
1691 struct buffer_head * insert_point;
1692 int isize;
1694 if ((size & 511) || (size > PAGE_SIZE)) {
1695 printk("VFS: grow_buffers: size = %d\n",size);
1696 return 0;
1699 if (!(page = __get_free_page(GFP_BUFFER)))
1700 return 0;
1701 bh = create_buffers(page, size, 0);
1702 if (!bh) {
1703 free_page(page);
1704 return 0;
1707 isize = BUFSIZE_INDEX(size);
1709 spin_lock(&free_list[isize].lock);
1710 insert_point = free_list[isize].list;
1711 tmp = bh;
1712 while (1) {
1713 if (insert_point) {
1714 tmp->b_next_free = insert_point->b_next_free;
1715 tmp->b_prev_free = insert_point;
1716 insert_point->b_next_free->b_prev_free = tmp;
1717 insert_point->b_next_free = tmp;
1718 } else {
1719 tmp->b_prev_free = tmp;
1720 tmp->b_next_free = tmp;
1722 insert_point = tmp;
1723 if (tmp->b_this_page)
1724 tmp = tmp->b_this_page;
1725 else
1726 break;
1728 tmp->b_this_page = bh;
1729 free_list[isize].list = bh;
1730 spin_unlock(&free_list[isize].lock);
1732 mem_map[MAP_NR(page)].buffers = bh;
1733 atomic_add(PAGE_SIZE, &buffermem);
1734 return 1;
1738 * Can the buffer be thrown out?
1740 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1741 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
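/*
 * That is, a buffer head is busy if it is still referenced
 * (b_count != 0) or is dirty, locked or protected.
 */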
1744 * try_to_free_buffers() checks if all the buffers on this particular page
1745 * are unused, and frees the page if so.
1747 * Wake up bdflush() if this fails - if we're running low on memory due
1748 * to dirty buffers, we need to flush them out as quickly as possible.
1750 * NOTE: There are quite a number of ways that threads of control can
1751 * obtain a reference to a buffer head within a page. So we must
1752 * lock out all of these paths to cleanly toss the page.
1754 int try_to_free_buffers(struct page * page)
1756 struct buffer_head * tmp, * bh = page->buffers;
1757 int index = BUFSIZE_INDEX(bh->b_size);
1758 int ret;
1760 spin_lock(&lru_list_lock);
1761 write_lock(&hash_table_lock);
1762 spin_lock(&free_list[index].lock);
1763 tmp = bh;
1764 do {
1765 struct buffer_head * p = tmp;
1767 tmp = tmp->b_this_page;
1768 if (buffer_busy(p))
1769 goto busy_buffer_page;
1770 } while (tmp != bh);
1772 spin_lock(&unused_list_lock);
1773 tmp = bh;
1774 do {
1775 struct buffer_head * p = tmp;
1776 tmp = tmp->b_this_page;
1778 /* The buffer can be either on the regular
1779 * queues or on the free list..
1781 if (p->b_dev == B_FREE) {
1782 __remove_from_free_list(p, index);
1783 } else {
1784 if (p->b_pprev)
1785 __hash_unlink(p);
1786 __remove_from_lru_list(p, p->b_list);
1788 __put_unused_buffer_head(p);
1789 } while (tmp != bh);
1790 spin_unlock(&unused_list_lock);
1792 /* Wake up anyone waiting for buffer heads */
1793 wake_up(&buffer_wait);
1795 /* And free the page */
1796 page->buffers = NULL;
1797 __free_page(page);
1798 ret = 1;
1799 out:
1800 spin_unlock(&free_list[index].lock);
1801 write_unlock(&hash_table_lock);
1802 spin_unlock(&lru_list_lock);
1803 return ret;
1805 busy_buffer_page:
1806 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
1807 too_many_dirty_buffers = 1;
1808 wakeup_bdflush(0);
1809 ret = 0;
1810 goto out;
1813 /* ===================== Init ======================= */
1816 * allocate the hash table and init the free list
1817 * Use gfp() for the hash table to decrease TLB misses, use
1818 * SLAB cache for buffer heads.
1820 void __init buffer_init(unsigned long memory_size)
1822 int order, i;
1823 unsigned int nr_hash;
1825 /* The buffer cache hash table is less important these days,
1826 * trim it a bit.
1827 */
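/*
 * For example, on a 32-bit machine with 64MB of memory and 4kB
 * pages this asks for 64MB >> 14 = 4096 pointers, i.e. 16kB, so an
 * order 2 allocation giving a table with 4096 hash chains.
 */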
1828 memory_size >>= 14;
1829 memory_size *= sizeof(struct buffer_head *);
1830 for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
1833 /* try to allocate something until we get it or we're asking
1834 for something that is really too small */
1836 do {
1837 unsigned long tmp;
1839 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
1840 bh_hash_mask = (nr_hash - 1);
1842 tmp = nr_hash;
1843 bh_hash_shift = 0;
1844 while((tmp >>= 1UL) != 0UL)
1845 bh_hash_shift++;
1847 hash_table = (struct buffer_head **)
1848 __get_free_pages(GFP_ATOMIC, order);
1849 } while (hash_table == NULL && --order > 0);
1850 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
1851 nr_hash, order, (1UL<<order) * PAGE_SIZE);
1853 if (!hash_table)
1854 panic("Failed to allocate buffer hash table\n");
1856 /* Setup hash chains. */
1857 for(i = 0; i < nr_hash; i++)
1858 hash_table[i] = NULL;
1860 /* Setup free lists. */
1861 for(i = 0; i < NR_SIZES; i++) {
1862 free_list[i].list = NULL;
1863 free_list[i].lock = SPIN_LOCK_UNLOCKED;
1866 /* Setup lru lists. */
1867 for(i = 0; i < NR_LIST; i++)
1868 lru_list[i] = NULL;
1870 bh_cachep = kmem_cache_create("buffer_head",
1871 sizeof(struct buffer_head),
1873 SLAB_HWCACHE_ALIGN, NULL, NULL);
1874 if(!bh_cachep)
1875 panic("Cannot create buffer head SLAB cache\n");
1879 /* ====================== bdflush support =================== */
1881 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1882 * response to dirty buffers. Once this process is activated, we write back
1883 * a limited number of buffers to the disks and then go back to sleep again.
1885 static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
1886 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
1887 struct task_struct *bdflush_tsk = 0;
1889 void wakeup_bdflush(int wait)
1891 if (current == bdflush_tsk)
1892 return;
1893 if (wait)
1894 run_task_queue(&tq_disk);
1895 wake_up(&bdflush_wait);
1896 if (wait)
1897 sleep_on(&bdflush_done);
1902 * Here we attempt to write back old buffers. We also try to flush inodes
1903 * and supers as well, since this function is essentially "update", and
1904 * otherwise there would be no way of ensuring that these quantities ever
1905 * get written back. Ideally, we would have a timestamp on the inodes
1906 * and superblocks so that we could write back only the old ones as well
1909 static int sync_old_buffers(void)
1911 int nlist;
1913 lock_kernel();
1914 sync_supers(0);
1915 sync_inodes(0);
1916 unlock_kernel();
1918 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
1919 struct buffer_head *bh;
1920 repeat:
1921 spin_lock(&lru_list_lock);
1922 bh = lru_list[nlist];
1923 if(bh) {
1924 struct buffer_head *next;
1925 int i;
1926 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1927 next = bh->b_next_free;
1929 /* If the buffer is not on the proper list,
1930 * then refile it.
1932 if ((nlist == BUF_DIRTY &&
1933 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
1934 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
1935 __refile_buffer(bh);
1936 continue;
1939 if (buffer_locked(bh) || !buffer_dirty(bh))
1940 continue;
1942 /* OK, now we are committed to write it out. */
1943 bh->b_flushtime = 0;
1944 atomic_inc(&bh->b_count);
1945 spin_unlock(&lru_list_lock);
1946 ll_rw_block(WRITE, 1, &bh);
1947 atomic_dec(&bh->b_count);
1948 goto repeat;
1951 spin_unlock(&lru_list_lock);
1953 run_task_queue(&tq_disk);
1954 return 0;
1958 /* This is the interface to bdflush. As we get more sophisticated, we can
1959 * pass tuning parameters to this "process", to adjust how it behaves.
1960 * We would want to verify each parameter, however, to make sure that it
1961 * is reasonable. */
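/*
 * Example (sketch): with the encoding used below, parameter 1
 * (ndirty) is read with bdflush(4, (long) &val) for an int val, and
 * written with bdflush(5, 600), subject to the bdflush_min/max bounds.
 */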
1963 asmlinkage int sys_bdflush(int func, long data)
1965 int i, error = -EPERM;
1967 if (!capable(CAP_SYS_ADMIN))
1968 goto out;
1970 if (func == 1) {
1971 error = sync_old_buffers();
1972 goto out;
1975 /* Basically func 2+2i reads parameter i and func 3+2i writes it */
1976 if (func >= 2) {
1977 i = (func-2) >> 1;
1978 error = -EINVAL;
1979 if (i < 0 || i >= N_PARAM)
1980 goto out;
1981 if((func & 1) == 0) {
1982 error = put_user(bdf_prm.data[i], (int*)data);
1983 goto out;
1985 if (data < bdflush_min[i] || data > bdflush_max[i])
1986 goto out;
1987 bdf_prm.data[i] = data;
1988 error = 0;
1989 goto out;
1992 /* Historically, func 0 launched the actual bdflush daemon and then
1993 * never returned (unless explicitly killed). We return zero here to
1994 * remain semi-compatible with present update(8) programs.
1995 */
1996 error = 0;
1997 out:
1998 return error;
2002 * This is the actual bdflush daemon itself. It used to be started from
2003 * the syscall above, but now we launch it ourselves internally with
2004 * kernel_thread(...) directly after the first thread in init/main.c
2006 int bdflush(void * unused)
2009 * We have a bare-bones task_struct, and really should fill
2010 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2011 * display semi-sane things. Not real crucial though...
2014 current->session = 1;
2015 current->pgrp = 1;
2016 sprintf(current->comm, "kflushd");
2017 bdflush_tsk = current;
2019 for (;;) {
2020 int nlist;
2022 CHECK_EMERGENCY_SYNC
2024 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2025 int nr, major, written = 0;
2026 struct buffer_head *next;
2028 repeat:
2029 spin_lock(&lru_list_lock);
2030 next = lru_list[nlist];
2031 nr = nr_buffers_type[nlist];
2032 while (nr-- > 0) {
2033 struct buffer_head *bh = next;
2035 next = next->b_next_free;
2037 /* If the buffer is not on the correct list,
2038 * then refile it.
2040 if ((nlist == BUF_DIRTY &&
2041 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2042 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2043 __refile_buffer(bh);
2044 continue;
2047 /* If we aren't in panic mode, don't write out too much
2048 * at a time. Also, don't write out buffers we don't
2049 * really have to write out yet..
2051 if (!too_many_dirty_buffers) {
2052 if (written > bdf_prm.b_un.ndirty)
2053 break;
2054 if (time_before(jiffies, bh->b_flushtime))
2055 continue;
2058 if (buffer_locked(bh) || !buffer_dirty(bh))
2059 continue;
2061 major = MAJOR(bh->b_dev);
2062 written++;
2063 bh->b_flushtime = 0;
2066 * For the loop major we can try to do asynchronous writes,
2067 * but we have to guarantee that we're making some progress..
2069 atomic_inc(&bh->b_count);
2070 spin_unlock(&lru_list_lock);
2071 if (major == LOOP_MAJOR && written > 1) {
2072 ll_rw_block(WRITEA, 1, &bh);
2073 if (buffer_dirty(bh))
2074 --written;
2075 } else
2076 ll_rw_block(WRITE, 1, &bh);
2077 atomic_dec(&bh->b_count);
2078 goto repeat;
2080 spin_unlock(&lru_list_lock);
2082 run_task_queue(&tq_disk);
2083 wake_up(&bdflush_done);
2086 * If there are still a lot of dirty buffers around,
2087 * skip the sleep and flush some more. Otherwise, we
2088 * sleep for a while and mark us as not being in panic
2089 * mode..
2091 if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
2092 too_many_dirty_buffers = 0;
2093 spin_lock_irq(&current->sigmask_lock);
2094 flush_signals(current);
2095 spin_unlock_irq(&current->sigmask_lock);
2096 interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);