/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
 * been avoided by NEVER letting an interrupt change a buffer (except for the
 * data, of course), but instead letting the caller do it.
 */

/* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */

/* Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 */

/* Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. -DaveM
 */

/* Added 32k buffer block sizes - these are required on older ARM systems.
 * - RMK
 */
27 #include <linux/sched.h>
28 #include <linux/kernel.h>
29 #include <linux/major.h>
30 #include <linux/string.h>
31 #include <linux/locks.h>
32 #include <linux/errno.h>
33 #include <linux/malloc.h>
34 #include <linux/slab.h>
35 #include <linux/pagemap.h>
36 #include <linux/swap.h>
37 #include <linux/swapctl.h>
38 #include <linux/smp.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
47 #include <asm/system.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
52 #define NR_SIZES 7
53 static char buffersize_index[65] =
54 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
55 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
56 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 6};
60 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
61 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
62 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
63 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
64 number of unused buffer heads */
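
/*
 * Worked example: the buffersize_index[] table maps (size >> 9) to a small
 * index, so BUFSIZE_INDEX(512) == 0, BUFSIZE_INDEX(1024) == 1,
 * BUFSIZE_INDEX(2048) == 2, BUFSIZE_INDEX(4096) == 3, and so on up to
 * BUFSIZE_INDEX(32768) == 6 == NR_SIZES - 1.  Any size that is not a power
 * of two between 512 bytes and 32k lands on a -1 entry.
 */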
/*
 * Hash table mask..
 */
69 static unsigned long bh_hash_mask = 0;
71 static int grow_buffers(int pri, int size);
73 static struct buffer_head ** hash_table;
74 static struct buffer_head * lru_list[NR_LIST] = {NULL, };
75 static struct buffer_head * free_list[NR_SIZES] = {NULL, };
77 static kmem_cache_t *bh_cachep;
79 static struct buffer_head * unused_list = NULL;
80 static struct buffer_head * reuse_list = NULL;
81 static struct wait_queue * buffer_wait = NULL;
83 static int nr_buffers = 0;
84 static int nr_buffers_type[NR_LIST] = {0,};
85 static int nr_buffer_heads = 0;
86 static int nr_unused_buffer_heads = 0;
static int refilled = 0;	/* Set NZ when a buffer freelist is refilled;
				   this is used by the loop device */
90 /* This is used by some architectures to estimate available memory. */
91 int buffermem = 0;
/* Here is the parameter block for the bdflush process. If you add or
 * remove any of the parameters, make sure to update kernel/sysctl.c.
 */

#define N_PARAM 9

/* The dummy values in this structure are left in there for compatibility
 * with old programs that play with the /proc entries.
 */
102 union bdflush_param{
103 struct {
104 int nfract; /* Percentage of buffer cache dirty to
105 activate bdflush */
106 int ndirty; /* Maximum number of dirty blocks to write out per
107 wake-cycle */
108 int nrefill; /* Number of clean buffers to try to obtain
109 each time we call refill */
110 int nref_dirt; /* Dirty buffer threshold for activating bdflush
111 when trying to refill buffers. */
112 int dummy1; /* unused */
113 int age_buffer; /* Time for normal buffer to age before
114 we flush it */
115 int age_super; /* Time for superblock to age before we
116 flush it */
117 int dummy2; /* unused */
118 int dummy3; /* unused */
119 } b_un;
120 unsigned int data[N_PARAM];
121 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
123 /* These are the min and max parameter values that we will allow to be assigned */
124 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 100, 100, 1, 1};
125 int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 60000, 60000, 2047, 5};
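
/*
 * Illustration of the defaults above: nfract == 40 means bdflush gets woken
 * once more than 40% of nr_buffers are dirty (see refile_buffer() below),
 * ndirty == 500 caps how many dirty blocks a single wake-up cycle writes out,
 * and age_buffer == 30*HZ / age_super == 5*HZ are the flush ages, in jiffies,
 * applied by set_writetime().
 */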
127 void wakeup_bdflush(int);
/*
 * Rewrote the wait-routines to use the "new" wait-queue functionality,
 * and getting rid of the cli-sti pairs. The wait-queue routines still
 * need cli-sti, but now it's just a couple of 386 instructions or so.
 *
 * Note that the real wait_on_buffer() is an inline function that checks
 * if 'b_wait' is set before calling this, so that the queues aren't set
 * up unnecessarily.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	struct task_struct *tsk = current;
	struct wait_queue wait;

	bh->b_count++;
	wait.task = tsk;
	add_wait_queue(&bh->b_wait, &wait);
repeat:
	tsk->state = TASK_UNINTERRUPTIBLE;
	run_task_queue(&tq_disk);
	if (buffer_locked(bh)) {
		schedule();
		goto repeat;
	}
	tsk->state = TASK_RUNNING;
	remove_wait_queue(&bh->b_wait, &wait);
	bh->b_count--;
}
/* Call sync_buffers with wait!=0 to ensure that the call does not
 * return until all buffer writes have completed. Sync() may return
 * before the writes have finished; fsync() may not.
 */

/* Godamity-damn. Some buffers (bitmaps for filesystems)
 * spontaneously dirty themselves without ever brelse being called.
 * We will ultimately want to put these in a separate list, but for
 * now we search all of the lists for dirty buffers.
 */
168 static int sync_buffers(kdev_t dev, int wait)
170 int i, retry, pass = 0, err = 0;
171 struct buffer_head * bh, *next;
	/* One pass for no-wait, three for wait:
	 * 0) write out all dirty, unlocked buffers;
	 * 1) write out all dirty buffers, waiting if locked;
	 * 2) wait for completion by waiting for all buffers to unlock.
	 */
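
	/* In other words: sync_buffers(dev, 0) makes a single non-blocking
	 * pass, while sync_buffers(dev, 1) can loop up to pass 2, as the
	 * "while (wait && retry && ++pass<=2)" at the bottom shows.
	 */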
178 do {
179 retry = 0;
180 repeat:
181 /* We search all lists as a failsafe mechanism, not because we expect
182 * there to be dirty buffers on any of the other lists.
184 bh = lru_list[BUF_DIRTY];
185 if (!bh)
186 goto repeat2;
187 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
188 if (bh->b_list != BUF_DIRTY)
189 goto repeat;
190 next = bh->b_next_free;
191 if (!lru_list[BUF_DIRTY])
192 break;
193 if (dev && bh->b_dev != dev)
194 continue;
195 if (buffer_locked(bh)) {
196 /* Buffer is locked; skip it unless wait is
197 * requested AND pass > 0.
199 if (!wait || !pass) {
200 retry = 1;
201 continue;
203 wait_on_buffer (bh);
204 goto repeat;
207 /* If an unlocked buffer is not uptodate, there has
208 * been an IO error. Skip it.
210 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
211 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
212 err = -EIO;
213 continue;
216 /* Don't write clean buffers. Don't write ANY buffers
217 * on the third pass.
219 if (!buffer_dirty(bh) || pass >= 2)
220 continue;
222 /* Don't bother about locked buffers.
224 * XXX We checked if it was locked above and there is no
225 * XXX way we could have slept in between. -DaveM
227 if (buffer_locked(bh))
228 continue;
229 bh->b_count++;
230 next->b_count++;
231 bh->b_flushtime = 0;
232 ll_rw_block(WRITE, 1, &bh);
233 bh->b_count--;
234 next->b_count--;
235 retry = 1;
238 repeat2:
239 bh = lru_list[BUF_LOCKED];
240 if (!bh)
241 break;
242 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
243 if (bh->b_list != BUF_LOCKED)
244 goto repeat2;
245 next = bh->b_next_free;
246 if (!lru_list[BUF_LOCKED])
247 break;
248 if (dev && bh->b_dev != dev)
249 continue;
250 if (buffer_locked(bh)) {
251 /* Buffer is locked; skip it unless wait is
252 * requested AND pass > 0.
254 if (!wait || !pass) {
255 retry = 1;
256 continue;
258 wait_on_buffer (bh);
259 goto repeat2;
263 /* If we are waiting for the sync to succeed, and if any dirty
264 * blocks were written, then repeat; on the second pass, only
265 * wait for buffers being written (do not pass to write any
266 * more buffers on the second pass).
268 } while (wait && retry && ++pass<=2);
269 return err;
void sync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	sync_buffers(dev, 0);
	DQUOT_SYNC(dev);
	/*
	 * FIXME(eric) we need to sync the physical devices here.
	 * This is because some (scsi) controllers have huge amounts of
	 * cache onboard (hundreds of Mb), and we need to instruct
	 * them to commit all of the dirty memory to disk, and we should
	 * not return until this has happened.
	 *
	 * This would need to get implemented by going through the assorted
	 * layers so that each block major number can be synced, and this
	 * would call down into the upper and mid-layer scsi.
	 */
}

int fsync_dev(kdev_t dev)
{
	sync_buffers(dev, 0);
	sync_supers(dev);
	sync_inodes(dev);
	DQUOT_SYNC(dev);
	return sync_buffers(dev, 1);
}

asmlinkage int sys_sync(void)
{
	lock_kernel();
	fsync_dev(0);
	unlock_kernel();
	return 0;
}

/*
 * filp may be NULL if called via the msync of a vma.
 */
313 int file_fsync(struct file *filp, struct dentry *dentry)
315 struct inode * inode = dentry->d_inode;
316 struct super_block * sb;
317 kdev_t dev;
319 /* sync the inode to buffers */
320 write_inode_now(inode);
322 /* sync the superblock to buffers */
323 sb = inode->i_sb;
324 wait_on_super(sb);
325 if (sb->s_op && sb->s_op->write_super)
326 sb->s_op->write_super(sb);
328 /* .. finally sync the buffers to disk */
329 dev = inode->i_dev;
330 return sync_buffers(dev, 1);
333 asmlinkage int sys_fsync(unsigned int fd)
335 struct file * file;
336 struct dentry * dentry;
337 struct inode * inode;
338 int err;
340 lock_kernel();
341 err = -EBADF;
342 file = fget(fd);
343 if (!file)
344 goto out;
346 dentry = file->f_dentry;
347 if (!dentry)
348 goto out_putf;
350 inode = dentry->d_inode;
351 if (!inode)
352 goto out_putf;
354 err = -EINVAL;
355 if (!file->f_op || !file->f_op->fsync)
356 goto out_putf;
358 /* We need to protect against concurrent writers.. */
359 down(&inode->i_sem);
360 err = file->f_op->fsync(file, dentry);
361 up(&inode->i_sem);
363 out_putf:
364 fput(file);
365 out:
366 unlock_kernel();
367 return err;
370 asmlinkage int sys_fdatasync(unsigned int fd)
372 struct file * file;
373 struct dentry * dentry;
374 struct inode * inode;
375 int err;
377 lock_kernel();
378 err = -EBADF;
379 file = fget(fd);
380 if (!file)
381 goto out;
383 dentry = file->f_dentry;
384 if (!dentry)
385 goto out_putf;
387 inode = dentry->d_inode;
388 if (!inode)
389 goto out_putf;
391 err = -EINVAL;
392 if (!file->f_op || !file->f_op->fsync)
393 goto out_putf;
395 /* this needs further work, at the moment it is identical to fsync() */
396 err = file->f_op->fsync(file, dentry);
398 out_putf:
399 fput(file);
400 out:
401 unlock_kernel();
402 return err;
405 void invalidate_buffers(kdev_t dev)
407 int i;
408 int nlist;
409 struct buffer_head * bh;
411 for(nlist = 0; nlist < NR_LIST; nlist++) {
412 bh = lru_list[nlist];
413 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
414 if (bh->b_dev != dev)
415 continue;
416 wait_on_buffer(bh);
417 if (bh->b_dev != dev)
418 continue;
419 if (bh->b_count)
420 continue;
421 bh->b_flushtime = 0;
422 clear_bit(BH_Protected, &bh->b_state);
423 clear_bit(BH_Uptodate, &bh->b_state);
424 clear_bit(BH_Dirty, &bh->b_state);
425 clear_bit(BH_Req, &bh->b_state);
430 #define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
431 #define hash(dev,block) hash_table[_hashfn(dev,block)]
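
/*
 * Illustration: bh_hash_mask is nr_hash - 1 with nr_hash a power of two
 * (see buffer_init() below), so the "& bh_hash_mask" in _hashfn() reduces
 * HASHDEV(dev) ^ block to a bucket index without a divide.  For example,
 * with bh_hash_mask == 0x7fff a value of 0x12345 hashes to bucket 0x2345.
 */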
433 static inline void remove_from_hash_queue(struct buffer_head * bh)
435 if (bh->b_pprev) {
436 if(bh->b_next)
437 bh->b_next->b_pprev = bh->b_pprev;
438 *bh->b_pprev = bh->b_next;
439 bh->b_pprev = NULL;
443 static inline void remove_from_lru_list(struct buffer_head * bh)
445 if (!(bh->b_prev_free) || !(bh->b_next_free))
446 panic("VFS: LRU block list corrupted");
447 if (bh->b_dev == B_FREE)
448 panic("LRU list corrupted");
449 bh->b_prev_free->b_next_free = bh->b_next_free;
450 bh->b_next_free->b_prev_free = bh->b_prev_free;
452 if (lru_list[bh->b_list] == bh)
453 lru_list[bh->b_list] = bh->b_next_free;
454 if (lru_list[bh->b_list] == bh)
455 lru_list[bh->b_list] = NULL;
456 bh->b_next_free = bh->b_prev_free = NULL;
459 static inline void remove_from_free_list(struct buffer_head * bh)
461 int isize = BUFSIZE_INDEX(bh->b_size);
462 if (!(bh->b_prev_free) || !(bh->b_next_free))
463 panic("VFS: Free block list corrupted");
464 if(bh->b_dev != B_FREE)
465 panic("Free list corrupted");
466 if(!free_list[isize])
467 panic("Free list empty");
468 if(bh->b_next_free == bh)
469 free_list[isize] = NULL;
470 else {
471 bh->b_prev_free->b_next_free = bh->b_next_free;
472 bh->b_next_free->b_prev_free = bh->b_prev_free;
473 if (free_list[isize] == bh)
474 free_list[isize] = bh->b_next_free;
476 bh->b_next_free = bh->b_prev_free = NULL;
479 static inline void remove_from_queues(struct buffer_head * bh)
481 if(bh->b_dev == B_FREE) {
482 remove_from_free_list(bh); /* Free list entries should not be
483 in the hash queue */
484 return;
486 nr_buffers_type[bh->b_list]--;
487 remove_from_hash_queue(bh);
488 remove_from_lru_list(bh);
491 static inline void put_last_lru(struct buffer_head * bh)
493 if (bh) {
494 struct buffer_head **bhp = &lru_list[bh->b_list];
496 if (bh == *bhp) {
497 *bhp = bh->b_next_free;
498 return;
501 if(bh->b_dev == B_FREE)
502 panic("Wrong block for lru list");
504 /* Add to back of free list. */
505 remove_from_lru_list(bh);
506 if(!*bhp) {
507 *bhp = bh;
508 (*bhp)->b_prev_free = bh;
511 bh->b_next_free = *bhp;
512 bh->b_prev_free = (*bhp)->b_prev_free;
513 (*bhp)->b_prev_free->b_next_free = bh;
514 (*bhp)->b_prev_free = bh;
518 static inline void put_last_free(struct buffer_head * bh)
520 if (bh) {
521 struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
523 bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */
525 /* Add to back of free list. */
526 if(!*bhp) {
527 *bhp = bh;
528 bh->b_prev_free = bh;
531 bh->b_next_free = *bhp;
532 bh->b_prev_free = (*bhp)->b_prev_free;
533 (*bhp)->b_prev_free->b_next_free = bh;
534 (*bhp)->b_prev_free = bh;
538 static inline void insert_into_queues(struct buffer_head * bh)
540 /* put at end of free list */
541 if(bh->b_dev == B_FREE) {
542 put_last_free(bh);
543 } else {
544 struct buffer_head **bhp = &lru_list[bh->b_list];
546 if(!*bhp) {
547 *bhp = bh;
548 bh->b_prev_free = bh;
551 if (bh->b_next_free)
552 panic("VFS: buffer LRU pointers corrupted");
554 bh->b_next_free = *bhp;
555 bh->b_prev_free = (*bhp)->b_prev_free;
556 (*bhp)->b_prev_free->b_next_free = bh;
557 (*bhp)->b_prev_free = bh;
559 nr_buffers_type[bh->b_list]++;
561 /* Put the buffer in new hash-queue if it has a device. */
562 if (bh->b_dev) {
563 struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
564 if((bh->b_next = *bhp) != NULL)
565 (*bhp)->b_pprev = &bh->b_next;
566 *bhp = bh;
567 bh->b_pprev = bhp; /* Exists in bh hashes. */
568 } else
569 bh->b_pprev = NULL; /* Not in bh hashes. */
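
	/* Note on the hash linkage above: b_pprev points at whichever pointer
	 * currently references this buffer (the hash_table[] slot or the
	 * previous buffer's b_next), which is why remove_from_hash_queue()
	 * can unlink a buffer without walking or re-hashing the chain.
	 */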
573 struct buffer_head * find_buffer(kdev_t dev, int block, int size)
575 struct buffer_head * next;
577 next = hash(dev,block);
578 for (;;) {
579 struct buffer_head *tmp = next;
580 if (!next)
581 break;
582 next = tmp->b_next;
583 if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
584 continue;
585 next = tmp;
586 break;
588 return next;
592 * Why like this, I hear you say... The reason is race-conditions.
593 * As we don't lock buffers (unless we are reading them, that is),
594 * something might happen to it while we sleep (ie a read-error
595 * will force it bad). This shouldn't really happen currently, but
596 * the code is ready.
598 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
600 struct buffer_head * bh;
601 for (;;) {
602 bh = find_buffer(dev,block,size);
603 if (!bh)
604 break;
605 bh->b_count++;
606 bh->b_lru_time = jiffies;
607 if (!buffer_locked(bh))
608 break;
609 __wait_on_buffer(bh);
610 if (bh->b_dev == dev &&
611 bh->b_blocknr == block &&
612 bh->b_size == size)
613 break;
614 bh->b_count--;
616 return bh;
619 unsigned int get_hardblocksize(kdev_t dev)
622 * Get the hard sector size for the given device. If we don't know
623 * what it is, return 0.
625 if (hardsect_size[MAJOR(dev)] != NULL) {
626 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
627 if (blksize != 0)
628 return blksize;
632 * We don't know what the hardware sector size for this device is.
633 * Return 0 indicating that we don't know.
635 return 0;
638 void set_blocksize(kdev_t dev, int size)
640 extern int *blksize_size[];
641 int i, nlist;
642 struct buffer_head * bh, *bhnext;
644 if (!blksize_size[MAJOR(dev)])
645 return;
647 /* Size must be a power of two, and between 512 and PAGE_SIZE */
648 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
649 panic("Invalid blocksize passed to set_blocksize");
651 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
652 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
653 return;
655 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
656 return;
657 sync_buffers(dev, 2);
658 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
660 /* We need to be quite careful how we do this - we are moving entries
661 * around on the free list, and we can get in a loop if we are not careful.
663 for(nlist = 0; nlist < NR_LIST; nlist++) {
664 bh = lru_list[nlist];
665 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
666 if(!bh)
667 break;
669 bhnext = bh->b_next_free;
670 if (bh->b_dev != dev)
671 continue;
672 if (bh->b_size == size)
673 continue;
674 bhnext->b_count++;
675 wait_on_buffer(bh);
676 bhnext->b_count--;
677 if (bh->b_dev == dev && bh->b_size != size) {
678 clear_bit(BH_Dirty, &bh->b_state);
679 clear_bit(BH_Uptodate, &bh->b_state);
680 clear_bit(BH_Req, &bh->b_state);
681 bh->b_flushtime = 0;
683 remove_from_hash_queue(bh);
689 * Find a candidate buffer to be reclaimed.
690 * N.B. Must search the entire BUF_LOCKED list rather than terminating
691 * when the first locked buffer is found. Buffers are unlocked at
692 * completion of IO, and under some conditions there may be (many)
693 * unlocked buffers after the first locked one.
695 static struct buffer_head *find_candidate(struct buffer_head *bh,
696 int *list_len, int size)
698 if (!bh)
699 goto no_candidate;
701 for (; (*list_len) > 0; bh = bh->b_next_free, (*list_len)--) {
702 if (size != bh->b_size) {
			/* This provides a mechanism for freeing blocks
			 * of other sizes; this is necessary now that we
			 * no longer have the lav code.
			 */
707 try_to_free_buffer(bh,&bh,1);
708 if (!bh)
709 break;
710 continue;
712 else if (!bh->b_count &&
713 !buffer_locked(bh) &&
714 !buffer_protected(bh) &&
715 !buffer_dirty(bh))
716 return bh;
719 no_candidate:
720 return NULL;
723 static void refill_freelist(int size)
725 struct buffer_head * bh, * next;
726 struct buffer_head * candidate[BUF_DIRTY];
727 int buffers[BUF_DIRTY];
728 int i;
729 int needed, obtained=0;
731 refilled = 1;
733 /* We are going to try to locate this much memory. */
734 needed = bdf_prm.b_un.nrefill * size;
736 while ((nr_free_pages > freepages.min*2) &&
737 (buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.max_percent * num_physpages) &&
738 grow_buffers(GFP_BUFFER, size)) {
739 obtained += PAGE_SIZE;
740 if (obtained >= needed)
741 return;
745 * Update the needed amount based on the number of potentially
746 * freeable buffers. We don't want to free more than one quarter
747 * of the available buffers.
749 i = (nr_buffers_type[BUF_CLEAN] + nr_buffers_type[BUF_LOCKED]) >> 2;
750 if (i < bdf_prm.b_un.nrefill) {
751 needed = i * size;
752 if (needed < PAGE_SIZE)
753 needed = PAGE_SIZE;
757 * OK, we cannot grow the buffer cache, now try to get some
758 * from the lru list.
760 repeat:
761 if (obtained >= needed)
762 return;
765 * First set the candidate pointers to usable buffers. This
766 * should be quick nearly all of the time. N.B. There must be
767 * no blocking calls after setting up the candidate[] array!
769 for (i = BUF_CLEAN; i<BUF_DIRTY; i++) {
770 buffers[i] = nr_buffers_type[i];
771 candidate[i] = find_candidate(lru_list[i], &buffers[i], size);
775 * Select the older of the available buffers until we reach our goal.
777 for (;;) {
778 i = BUF_CLEAN;
779 if (!candidate[BUF_CLEAN]) {
780 if (!candidate[BUF_LOCKED])
781 break;
782 i = BUF_LOCKED;
784 else if (candidate[BUF_LOCKED] &&
785 (candidate[BUF_LOCKED]->b_lru_time <
786 candidate[BUF_CLEAN ]->b_lru_time))
787 i = BUF_LOCKED;
789 * Free the selected buffer and get the next candidate.
791 bh = candidate[i];
792 next = bh->b_next_free;
794 obtained += bh->b_size;
795 remove_from_queues(bh);
796 put_last_free(bh);
797 if (obtained >= needed)
798 return;
800 if (--buffers[i] && bh != next)
801 candidate[i] = find_candidate(next, &buffers[i], size);
802 else
803 candidate[i] = NULL;
807 * If there are dirty buffers, do a non-blocking wake-up.
808 * This increases the chances of having buffers available
809 * for the next call ...
811 if (nr_buffers_type[BUF_DIRTY])
812 wakeup_bdflush(0);
815 * Allocate buffers to reach half our goal, if possible.
816 * Since the allocation doesn't block, there's no reason
817 * to search the buffer lists again. Then return if there
818 * are _any_ free buffers.
820 while (obtained < (needed >> 1) &&
821 nr_free_pages > freepages.min + 5 &&
822 grow_buffers(GFP_BUFFER, size))
823 obtained += PAGE_SIZE;
825 if (free_list[BUFSIZE_INDEX(size)])
826 return;
829 * If there are dirty buffers, wait while bdflush writes
830 * them out. The buffers become locked, but we can just
831 * wait for one to unlock ...
833 if (nr_buffers_type[BUF_DIRTY])
834 wakeup_bdflush(1);
837 * In order to prevent a buffer shortage from exhausting
838 * the system's reserved pages, we force tasks to wait
839 * before using reserved pages for buffers. This is easily
840 * accomplished by waiting on an unused locked buffer.
842 if ((bh = lru_list[BUF_LOCKED]) != NULL) {
843 for (i = nr_buffers_type[BUF_LOCKED]; i--; bh = bh->b_next_free)
845 if (bh->b_size != size)
846 continue;
847 if (bh->b_count)
848 continue;
849 if (!buffer_locked(bh))
850 continue;
851 if (buffer_dirty(bh) || buffer_protected(bh))
852 continue;
853 if (MAJOR(bh->b_dev) == LOOP_MAJOR)
854 continue;
856 * We've found an unused, locked, non-dirty buffer of
857 * the correct size. Claim it so no one else can,
858 * then wait for it to unlock.
860 bh->b_count++;
861 wait_on_buffer(bh);
862 bh->b_count--;
864 * Loop back to harvest this (and maybe other) buffers.
866 goto repeat;
871 * Convert a reserved page into buffers ... should happen only rarely.
873 if (grow_buffers(GFP_ATOMIC, size)) {
874 #ifdef BUFFER_DEBUG
875 printk("refill_freelist: used reserve page\n");
876 #endif
877 return;
881 * System is _very_ low on memory ... sleep and try later.
883 #ifdef BUFFER_DEBUG
884 printk("refill_freelist: task %s waiting for buffers\n", current->comm);
885 #endif
886 schedule();
887 goto repeat;
890 void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
891 bh_end_io_t *handler, void *dev_id)
893 bh->b_count = 1;
894 bh->b_list = BUF_CLEAN;
895 bh->b_flushtime = 0;
896 bh->b_dev = dev;
897 bh->b_blocknr = block;
898 bh->b_end_io = handler;
899 bh->b_dev_id = dev_id;
static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);
}
909 * Ok, this is getblk, and it isn't very clear, again to hinder
910 * race-conditions. Most of the code is seldom used, (ie repeating),
911 * so it should be much more efficient than it looks.
913 * The algorithm is changed: hopefully better, and an elusive bug removed.
915 * 14.02.92: changed it to sync dirty buffers a bit: better performance
916 * when the filesystem starts to get full of dirty blocks (I hope).
918 struct buffer_head * getblk(kdev_t dev, int block, int size)
920 struct buffer_head * bh;
921 int isize;
923 repeat:
924 bh = get_hash_table(dev, block, size);
925 if (bh) {
926 if (!buffer_dirty(bh)) {
927 if (buffer_uptodate(bh))
928 put_last_lru(bh);
929 bh->b_flushtime = 0;
931 set_bit(BH_Touched, &bh->b_state);
932 return bh;
935 isize = BUFSIZE_INDEX(size);
936 get_free:
937 bh = free_list[isize];
938 if (!bh)
939 goto refill;
940 remove_from_free_list(bh);
942 /* OK, FINALLY we know that this buffer is the only one of its kind,
943 * and that it's unused (b_count=0), unlocked, and clean.
945 init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
946 bh->b_lru_time = jiffies;
947 bh->b_state=(1<<BH_Touched);
948 insert_into_queues(bh);
949 return bh;
952 * If we block while refilling the free list, somebody may
953 * create the buffer first ... search the hashes again.
955 refill:
956 refill_freelist(size);
957 if (!find_buffer(dev,block,size))
958 goto get_free;
959 goto repeat;
962 void set_writetime(struct buffer_head * buf, int flag)
964 int newtime;
966 if (buffer_dirty(buf)) {
967 /* Move buffer to dirty list if jiffies is clear. */
968 newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
969 bdf_prm.b_un.age_buffer);
970 if(!buf->b_flushtime || buf->b_flushtime > newtime)
971 buf->b_flushtime = newtime;
972 } else {
973 buf->b_flushtime = 0;
979 * Put a buffer into the appropriate list, without side-effects.
981 static inline void file_buffer(struct buffer_head *bh, int list)
983 remove_from_queues(bh);
984 bh->b_list = list;
985 insert_into_queues(bh);
989 * A buffer may need to be moved from one buffer list to another
990 * (e.g. in case it is not shared any more). Handle this.
992 void refile_buffer(struct buffer_head * buf)
994 int dispose;
996 if(buf->b_dev == B_FREE) {
997 printk("Attempt to refile free buffer\n");
998 return;
1000 if (buffer_dirty(buf))
1001 dispose = BUF_DIRTY;
1002 else if (buffer_locked(buf))
1003 dispose = BUF_LOCKED;
1004 else
1005 dispose = BUF_CLEAN;
1006 if(dispose != buf->b_list) {
1007 file_buffer(buf, dispose);
1008 if(dispose == BUF_DIRTY) {
1009 int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
1011 /* This buffer is dirty, maybe we need to start flushing.
1012 * If too high a percentage of the buffers are dirty...
1014 if (nr_buffers_type[BUF_DIRTY] > too_many)
1015 wakeup_bdflush(0);
1017 /* If this is a loop device, and
1018 * more than half of the buffers are dirty...
1019 * (Prevents no-free-buffers deadlock with loop device.)
1021 if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
1022 nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
1023 wakeup_bdflush(1);
1029 * Release a buffer head
1031 void __brelse(struct buffer_head * buf)
1033 wait_on_buffer(buf);
1035 /* If dirty, mark the time this buffer should be written back. */
1036 set_writetime(buf, 0);
1037 refile_buffer(buf);
1039 if (buf->b_count) {
1040 buf->b_count--;
1041 return;
1043 printk("VFS: brelse: Trying to free free buffer\n");
1047 * bforget() is like brelse(), except it removes the buffer
1048 * from the hash-queues (so that it won't be re-used if it's
1049 * shared).
1051 void __bforget(struct buffer_head * buf)
1053 wait_on_buffer(buf);
1054 mark_buffer_clean(buf);
1055 clear_bit(BH_Protected, &buf->b_state);
1056 remove_from_hash_queue(buf);
1057 buf->b_dev = NODEV;
1058 refile_buffer(buf);
1059 if (!--buf->b_count)
1060 return;
1061 printk("VFS: forgot an in-use buffer! (count=%d)\n",
1062 buf->b_count);
1066 * bread() reads a specified block and returns the buffer that contains
1067 * it. It returns NULL if the block was unreadable.
1069 struct buffer_head * bread(kdev_t dev, int block, int size)
1071 struct buffer_head * bh = getblk(dev, block, size);
1073 if (bh) {
1074 if (buffer_uptodate(bh))
1075 return bh;
1076 ll_rw_block(READ, 1, &bh);
1077 wait_on_buffer(bh);
1078 if (buffer_uptodate(bh))
1079 return bh;
1080 brelse(bh);
1081 return NULL;
1083 printk("VFS: bread: impossible error\n");
1084 return NULL;
/*
 * Ok, breada can be used as bread, but additionally marks other
 * blocks for reading as well. End the argument list with a negative
 * number.
 */
1093 #define NBUF 16
1095 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1096 unsigned int pos, unsigned int filesize)
1098 struct buffer_head * bhlist[NBUF];
1099 unsigned int blocks;
1100 struct buffer_head * bh;
1101 int index;
1102 int i, j;
1104 if (pos >= filesize)
1105 return NULL;
1107 if (block < 0 || !(bh = getblk(dev,block,bufsize)))
1108 return NULL;
1110 index = BUFSIZE_INDEX(bh->b_size);
1112 if (buffer_uptodate(bh))
1113 return(bh);
1114 else ll_rw_block(READ, 1, &bh);
1116 blocks = (filesize - pos) >> (9+index);
1118 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1119 blocks = read_ahead[MAJOR(dev)] >> index;
1120 if (blocks > NBUF)
1121 blocks = NBUF;
1123 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1126 bhlist[0] = bh;
1127 j = 1;
1128 for(i=1; i<blocks; i++) {
1129 bh = getblk(dev,block+i,bufsize);
1130 if (buffer_uptodate(bh)) {
1131 brelse(bh);
1132 break;
1134 else bhlist[j++] = bh;
1137 /* Request the read for these buffers, and then release them. */
1138 if (j>1)
1139 ll_rw_block(READA, (j-1), bhlist+1);
1140 for(i=1; i<j; i++)
1141 brelse(bhlist[i]);
1143 /* Wait for this buffer, and then continue on. */
1144 bh = bhlist[0];
1145 wait_on_buffer(bh);
1146 if (buffer_uptodate(bh))
1147 return bh;
1148 brelse(bh);
1149 return NULL;
1153 * Note: the caller should wake up the buffer_wait list if needed.
1155 static void put_unused_buffer_head(struct buffer_head * bh)
1157 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1158 nr_buffer_heads--;
1159 kmem_cache_free(bh_cachep, bh);
1160 return;
1163 memset(bh,0,sizeof(*bh));
1164 nr_unused_buffer_heads++;
1165 bh->b_next_free = unused_list;
1166 unused_list = bh;
1170 * We can't put completed temporary IO buffer_heads directly onto the
1171 * unused_list when they become unlocked, since the device driver
1172 * end_request routines still expect access to the buffer_head's
1173 * fields after the final unlock. So, the device driver puts them on
1174 * the reuse_list instead once IO completes, and we recover these to
1175 * the unused_list here.
1177 * Note that we don't do a wakeup here, but return a flag indicating
1178 * whether we got any buffer heads. A task ready to sleep can check
1179 * the returned value, and any tasks already sleeping will have been
1180 * awakened when the buffer heads were added to the reuse list.
1182 static inline int recover_reusable_buffer_heads(void)
1184 struct buffer_head *head = xchg(&reuse_list, NULL);
1185 int found = 0;
1187 if (head) {
1188 do {
1189 struct buffer_head *bh = head;
1190 head = head->b_next_free;
1191 put_unused_buffer_head(bh);
1192 } while (head);
1193 found = 1;
1195 return found;
1199 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1200 * no-buffer-head deadlock. Return NULL on failure; waiting for
1201 * buffer heads is now handled in create_buffers().
1203 static struct buffer_head * get_unused_buffer_head(int async)
1205 struct buffer_head * bh;
1207 recover_reusable_buffer_heads();
1208 if (nr_unused_buffer_heads > NR_RESERVED) {
1209 bh = unused_list;
1210 unused_list = bh->b_next_free;
1211 nr_unused_buffer_heads--;
1212 return bh;
1215 /* This is critical. We can't swap out pages to get
1216 * more buffer heads, because the swap-out may need
1217 * more buffer-heads itself. Thus SLAB_ATOMIC.
1219 if((bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC)) != NULL) {
1220 memset(bh, 0, sizeof(*bh));
1221 nr_buffer_heads++;
1222 return bh;
1226 * If we need an async buffer, use the reserved buffer heads.
1228 if (async && unused_list) {
1229 bh = unused_list;
1230 unused_list = bh->b_next_free;
1231 nr_unused_buffer_heads--;
1232 return bh;
1235 #if 0
1237 * (Pending further analysis ...)
1238 * Ordinary (non-async) requests can use a different memory priority
1239 * to free up pages. Any swapping thus generated will use async
1240 * buffer heads.
1242 if(!async &&
1243 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1244 memset(bh, 0, sizeof(*bh));
1245 nr_buffer_heads++;
1246 return bh;
1248 #endif
1250 return NULL;
1254 * Create the appropriate buffers when given a page for data area and
1255 * the size of each buffer.. Use the bh->b_this_page linked list to
1256 * follow the buffers created. Return NULL if unable to create more
1257 * buffers.
1258 * The async flag is used to differentiate async IO (paging, swapping)
1259 * from ordinary buffer allocations, and only async requests are allowed
1260 * to sleep waiting for buffer heads.
1262 static struct buffer_head * create_buffers(unsigned long page,
1263 unsigned long size, int async)
1265 struct wait_queue wait = { current, NULL };
1266 struct buffer_head *bh, *head;
1267 long offset;
1269 try_again:
1270 head = NULL;
1271 offset = PAGE_SIZE;
1272 while ((offset -= size) >= 0) {
1273 bh = get_unused_buffer_head(async);
1274 if (!bh)
1275 goto no_grow;
1277 bh->b_dev = B_FREE; /* Flag as unused */
1278 bh->b_this_page = head;
1279 head = bh;
1281 bh->b_state = 0;
1282 bh->b_next_free = NULL;
1283 bh->b_count = 0;
1284 bh->b_size = size;
1286 bh->b_data = (char *) (page+offset);
1287 bh->b_list = 0;
1289 return head;
1291 * In case anything failed, we just free everything we got.
1293 no_grow:
1294 if (head) {
1295 do {
1296 bh = head;
1297 head = head->b_this_page;
1298 put_unused_buffer_head(bh);
1299 } while (head);
1301 /* Wake up any waiters ... */
1302 wake_up(&buffer_wait);
1306 * Return failure for non-async IO requests. Async IO requests
1307 * are not allowed to fail, so we have to wait until buffer heads
1308 * become available. But we don't want tasks sleeping with
1309 * partially complete buffers, so all were released above.
1311 if (!async)
1312 return NULL;
1314 /* We're _really_ low on memory. Now we just
1315 * wait for old buffer heads to become free due to
1316 * finishing IO. Since this is an async request and
1317 * the reserve list is empty, we're sure there are
1318 * async buffer heads in use.
1320 run_task_queue(&tq_disk);
1323 * Set our state for sleeping, then check again for buffer heads.
1324 * This ensures we won't miss a wake_up from an interrupt.
1326 add_wait_queue(&buffer_wait, &wait);
1327 current->state = TASK_UNINTERRUPTIBLE;
1328 if (!recover_reusable_buffer_heads())
1329 schedule();
1330 remove_wait_queue(&buffer_wait, &wait);
1331 current->state = TASK_RUNNING;
1332 goto try_again;
1335 /* Run the hooks that have to be done when a page I/O has completed. */
1336 static inline void after_unlock_page (struct page * page)
1338 if (test_and_clear_bit(PG_decr_after, &page->flags)) {
1339 atomic_dec(&nr_async_pages);
1340 #ifdef DEBUG_SWAP
1341 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1342 (char *) page_address(page),
1343 atomic_read(&nr_async_pages));
1344 #endif
1346 if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
1347 swap_after_unlock_page(page->offset);
1348 if (test_and_clear_bit(PG_free_after, &page->flags))
1349 __free_page(page);
1353 * Free all temporary buffers belonging to a page.
1354 * This needs to be called with interrupts disabled.
1356 static inline void free_async_buffers (struct buffer_head * bh)
1358 struct buffer_head *tmp, *tail;
1361 * Link all the buffers into the b_next_free list,
1362 * so we only have to do one xchg() operation ...
1364 tail = bh;
1365 while ((tmp = tail->b_this_page) != bh) {
1366 tail->b_next_free = tmp;
1367 tail = tmp;
1370 /* Update the reuse list */
1371 tail->b_next_free = xchg(&reuse_list, NULL);
1372 reuse_list = bh;
1374 /* Wake up any waiters ... */
1375 wake_up(&buffer_wait);
1378 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
1380 unsigned long flags;
1381 struct buffer_head *tmp;
1382 struct page *page;
1384 mark_buffer_uptodate(bh, uptodate);
1385 unlock_buffer(bh);
1387 /* This is a temporary buffer used for page I/O. */
1388 page = mem_map + MAP_NR(bh->b_data);
1389 if (!PageLocked(page))
1390 goto not_locked;
1391 if (bh->b_count != 1)
1392 goto bad_count;
1394 if (!test_bit(BH_Uptodate, &bh->b_state))
1395 set_bit(PG_error, &page->flags);
1398 * Be _very_ careful from here on. Bad things can happen if
1399 * two buffer heads end IO at almost the same time and both
1400 * decide that the page is now completely done.
1402 * Async buffer_heads are here only as labels for IO, and get
1403 * thrown away once the IO for this page is complete. IO is
1404 * deemed complete once all buffers have been visited
1405 * (b_count==0) and are now unlocked. We must make sure that
1406 * only the _last_ buffer that decrements its count is the one
1407 * that free's the page..
1409 save_flags(flags);
1410 cli();
1411 bh->b_count--;
1412 tmp = bh;
1413 do {
1414 if (tmp->b_count)
1415 goto still_busy;
1416 tmp = tmp->b_this_page;
1417 } while (tmp != bh);
1419 /* OK, the async IO on this page is complete. */
1420 free_async_buffers(bh);
1421 restore_flags(flags);
1422 clear_bit(PG_locked, &page->flags);
1423 wake_up(&page->wait);
1424 after_unlock_page(page);
1425 return;
1427 still_busy:
1428 restore_flags(flags);
1429 return;
1431 not_locked:
1432 printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1433 return;
1435 bad_count:
1436 printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1437 return;
1441 * Start I/O on a page.
1442 * This function expects the page to be locked and may return before I/O is complete.
1443 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1445 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1447 struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
1448 int block, nr;
1450 if (!PageLocked(page))
1451 panic("brw_page: page not locked for I/O");
1452 clear_bit(PG_uptodate, &page->flags);
1453 clear_bit(PG_error, &page->flags);
1455 * Allocate async buffer heads pointing to this page, just for I/O.
1456 * They do _not_ show up in the buffer hash table!
1457 * They are _not_ registered in page->buffers either!
1459 bh = create_buffers(page_address(page), size, 1);
1460 if (!bh) {
1461 /* WSH: exit here leaves page->count incremented */
1462 clear_bit(PG_locked, &page->flags);
1463 wake_up(&page->wait);
1464 return -ENOMEM;
1466 nr = 0;
1467 next = bh;
1468 do {
1469 struct buffer_head * tmp;
1470 block = *(b++);
1472 init_buffer(next, dev, block, end_buffer_io_async, NULL);
1473 set_bit(BH_Uptodate, &next->b_state);
1476 * When we use bmap, we define block zero to represent
1477 * a hole. ll_rw_page, however, may legitimately
1478 * access block zero, and we need to distinguish the
1479 * two cases.
1481 if (bmap && !block) {
1482 memset(next->b_data, 0, size);
1483 next->b_count--;
1484 continue;
1486 tmp = get_hash_table(dev, block, size);
1487 if (tmp) {
1488 if (!buffer_uptodate(tmp)) {
1489 if (rw == READ)
1490 ll_rw_block(READ, 1, &tmp);
1491 wait_on_buffer(tmp);
1493 if (rw == READ)
1494 memcpy(next->b_data, tmp->b_data, size);
1495 else {
1496 memcpy(tmp->b_data, next->b_data, size);
1497 mark_buffer_dirty(tmp, 0);
1499 brelse(tmp);
1500 next->b_count--;
1501 continue;
1503 if (rw == READ)
1504 clear_bit(BH_Uptodate, &next->b_state);
1505 else
1506 set_bit(BH_Dirty, &next->b_state);
1507 arr[nr++] = next;
1508 } while (prev = next, (next = next->b_this_page) != NULL);
1509 prev->b_this_page = bh;
1511 if (nr) {
1512 ll_rw_block(rw, nr, arr);
1513 /* The rest of the work is done in mark_buffer_uptodate()
1514 * and unlock_buffer(). */
1515 } else {
1516 unsigned long flags;
1517 clear_bit(PG_locked, &page->flags);
1518 set_bit(PG_uptodate, &page->flags);
1519 wake_up(&page->wait);
1520 save_flags(flags);
1521 cli();
1522 free_async_buffers(bh);
1523 restore_flags(flags);
1524 after_unlock_page(page);
1526 ++current->maj_flt;
1527 return 0;
1531 * This is called by end_request() when I/O has completed.
1533 void mark_buffer_uptodate(struct buffer_head * bh, int on)
1535 if (on) {
1536 struct buffer_head *tmp = bh;
1537 set_bit(BH_Uptodate, &bh->b_state);
1538 /* If a page has buffers and all these buffers are uptodate,
1539 * then the page is uptodate. */
1540 do {
1541 if (!test_bit(BH_Uptodate, &tmp->b_state))
1542 return;
1543 tmp=tmp->b_this_page;
1544 } while (tmp && tmp != bh);
1545 set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
1546 return;
1548 clear_bit(BH_Uptodate, &bh->b_state);
1552 * Generic "readpage" function for block devices that have the normal
1553 * bmap functionality. This is most of the block device filesystems.
1554 * Reads the page asynchronously --- the unlock_buffer() and
1555 * mark_buffer_uptodate() functions propagate buffer state into the
1556 * page struct once IO has completed.
1558 int generic_readpage(struct file * file, struct page * page)
1560 struct dentry *dentry = file->f_dentry;
1561 struct inode *inode = dentry->d_inode;
1562 unsigned long block;
1563 int *p, nr[PAGE_SIZE/512];
1564 int i;
1566 atomic_inc(&page->count);
1567 set_bit(PG_locked, &page->flags);
1568 set_bit(PG_free_after, &page->flags);
1570 i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1571 block = page->offset >> inode->i_sb->s_blocksize_bits;
1572 p = nr;
1573 do {
1574 *p = inode->i_op->bmap(inode, block);
1575 i--;
1576 block++;
1577 p++;
1578 } while (i > 0);
1580 /* IO start */
1581 brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
1582 return 0;
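
/*
 * Worked example, assuming a 1k-block filesystem and PAGE_SIZE == 4096:
 * s_blocksize_bits == 10, so generic_readpage() bmaps i == 4 logical blocks
 * starting at block == page->offset >> 10 and hands the array to brw_page(),
 * which treats a bmap result of 0 as a hole and zero-fills it rather than
 * reading block 0.
 */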
1586 * Try to increase the number of buffers available: the size argument
1587 * is used to determine what kind of buffers we want.
1589 static int grow_buffers(int pri, int size)
1591 unsigned long page;
1592 struct buffer_head *bh, *tmp;
1593 struct buffer_head * insert_point;
1594 int isize;
1596 if ((size & 511) || (size > PAGE_SIZE)) {
1597 printk("VFS: grow_buffers: size = %d\n",size);
1598 return 0;
1601 if (!(page = __get_free_page(pri)))
1602 return 0;
1603 bh = create_buffers(page, size, 0);
1604 if (!bh) {
1605 free_page(page);
1606 return 0;
1609 isize = BUFSIZE_INDEX(size);
1610 insert_point = free_list[isize];
1612 tmp = bh;
1613 while (1) {
1614 if (insert_point) {
1615 tmp->b_next_free = insert_point->b_next_free;
1616 tmp->b_prev_free = insert_point;
1617 insert_point->b_next_free->b_prev_free = tmp;
1618 insert_point->b_next_free = tmp;
1619 } else {
1620 tmp->b_prev_free = tmp;
1621 tmp->b_next_free = tmp;
1623 insert_point = tmp;
1624 ++nr_buffers;
1625 if (tmp->b_this_page)
1626 tmp = tmp->b_this_page;
1627 else
1628 break;
1630 tmp->b_this_page = bh;
1631 free_list[isize] = bh;
1632 mem_map[MAP_NR(page)].buffers = bh;
1633 buffermem += PAGE_SIZE;
1634 return 1;
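
/*
 * Accounting note: each successful grow_buffers() call consumes exactly one
 * page, carves it into PAGE_SIZE/size buffer heads, links them circularly
 * through b_this_page, puts them on the matching free list, and credits
 * buffermem with PAGE_SIZE; try_to_free_buffer() below undoes this per page.
 */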
1638 /* =========== Reduce the buffer memory ============= */
1640 static inline int buffer_waiting(struct buffer_head * bh)
1642 return waitqueue_active(&bh->b_wait);
/*
 * try_to_free_buffer() checks if all the buffers on this particular page
 * are unused, and frees the page if so.
 */
1649 int try_to_free_buffer(struct buffer_head * bh, struct buffer_head ** bhp,
1650 int priority)
1652 unsigned long page;
1653 struct buffer_head * tmp, * p;
1655 *bhp = bh;
1656 page = (unsigned long) bh->b_data;
1657 page &= PAGE_MASK;
1658 tmp = bh;
1659 do {
1660 if (!tmp)
1661 return 0;
1662 if (tmp->b_count || buffer_protected(tmp) ||
1663 buffer_dirty(tmp) || buffer_locked(tmp) ||
1664 buffer_waiting(tmp))
1665 return 0;
1666 if (priority && buffer_touched(tmp))
1667 return 0;
1668 tmp = tmp->b_this_page;
1669 } while (tmp != bh);
1671 tmp = bh;
1672 do {
1673 p = tmp;
1674 tmp = tmp->b_this_page;
1675 nr_buffers--;
1676 if (p == *bhp) {
1677 *bhp = p->b_prev_free;
1678 if (p == *bhp) /* Was this the last in the list? */
1679 *bhp = NULL;
1681 remove_from_queues(p);
1682 put_unused_buffer_head(p);
1683 } while (tmp != bh);
1684 /* Wake up anyone waiting for buffer heads */
1685 wake_up(&buffer_wait);
1687 buffermem -= PAGE_SIZE;
1688 mem_map[MAP_NR(page)].buffers = NULL;
1689 free_page(page);
1690 return 1;
1693 /* ================== Debugging =================== */
1695 void show_buffers(void)
1697 struct buffer_head * bh;
1698 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
1699 int protected = 0;
1700 int nlist;
1701 static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
1703 printk("Buffer memory: %6dkB\n",buffermem>>10);
1704 printk("Buffer heads: %6d\n",nr_buffer_heads);
1705 printk("Buffer blocks: %6d\n",nr_buffers);
1707 for(nlist = 0; nlist < NR_LIST; nlist++) {
1708 found = locked = dirty = used = lastused = protected = 0;
1709 bh = lru_list[nlist];
1710 if(!bh) continue;
1712 do {
1713 found++;
1714 if (buffer_locked(bh))
1715 locked++;
1716 if (buffer_protected(bh))
1717 protected++;
1718 if (buffer_dirty(bh))
1719 dirty++;
1720 if (bh->b_count)
1721 used++, lastused = found;
1722 bh = bh->b_next_free;
1723 } while (bh != lru_list[nlist]);
1724 printk("%8s: %d buffers, %d used (last=%d), "
1725 "%d locked, %d protected, %d dirty\n",
1726 buf_types[nlist], found, used, lastused,
1727 locked, protected, dirty);
1732 /* ===================== Init ======================= */
/*
 * allocate the hash table and init the free list
 * Use gfp() for the hash table to decrease TLB misses, use
 * SLAB cache for buffer heads.
 */
1739 void __init buffer_init(void)
1741 int order = 5; /* Currently maximum order.. */
1742 unsigned int nr_hash;
1744 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct buffer_head *);
1745 hash_table = (struct buffer_head **) __get_free_pages(GFP_ATOMIC, order);
1747 if (!hash_table)
1748 panic("Failed to allocate buffer hash table\n");
1749 memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
1750 bh_hash_mask = nr_hash-1;
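
	/*
	 * Worked example, assuming PAGE_SIZE == 4096 and 32-bit pointers:
	 * order 5 gives 32 pages == 128kB of hash table, so
	 * nr_hash == 131072 / 4 == 32768 buckets and bh_hash_mask == 0x7fff.
	 * The power-of-two bucket count is what lets _hashfn() use a mask
	 * instead of a modulo.
	 */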
1752 bh_cachep = kmem_cache_create("buffer_head",
1753 sizeof(struct buffer_head),
1755 SLAB_HWCACHE_ALIGN, NULL, NULL);
1756 if(!bh_cachep)
1757 panic("Cannot create buffer head SLAB cache\n");
1759 * Allocate the reserved buffer heads.
1761 while (nr_buffer_heads < NR_RESERVED) {
1762 struct buffer_head * bh;
1764 bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
1765 if (!bh)
1766 break;
1767 put_unused_buffer_head(bh);
1768 nr_buffer_heads++;
1771 lru_list[BUF_CLEAN] = 0;
1772 grow_buffers(GFP_KERNEL, BLOCK_SIZE);
1776 /* ====================== bdflush support =================== */
/* This is a simple kernel daemon, whose job it is to provide a dynamic
 * response to dirty buffers. Once this process is activated, we write back
 * a limited number of buffers to the disks and then go back to sleep again.
 */
1782 static struct wait_queue * bdflush_wait = NULL;
1783 static struct wait_queue * bdflush_done = NULL;
1784 struct task_struct *bdflush_tsk = 0;
1786 void wakeup_bdflush(int wait)
1788 if (current == bdflush_tsk)
1789 return;
1790 wake_up(&bdflush_wait);
1791 if (wait) {
1792 run_task_queue(&tq_disk);
1793 sleep_on(&bdflush_done);
/*
 * Here we attempt to write back old buffers. We also try to flush inodes
 * and supers as well, since this function is essentially "update", and
 * otherwise there would be no way of ensuring that these quantities ever
 * get written back. Ideally, we would have a timestamp on the inodes
 * and superblocks so that we could write back only the old ones as well.
 */
1806 asmlinkage int sync_old_buffers(void)
1808 int i;
1809 int ndirty, nwritten;
1810 int nlist;
1811 int ncount;
1812 struct buffer_head * bh, *next;
1814 sync_supers(0);
1815 sync_inodes(0);
1817 ncount = 0;
1818 #ifdef DEBUG
1819 for(nlist = 0; nlist < NR_LIST; nlist++)
1820 #else
1821 for(nlist = BUF_DIRTY; nlist <= BUF_DIRTY; nlist++)
1822 #endif
1824 ndirty = 0;
1825 nwritten = 0;
1826 repeat:
1828 bh = lru_list[nlist];
1829 if(bh)
1830 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1831 /* We may have stalled while waiting for I/O to complete. */
1832 if(bh->b_list != nlist) goto repeat;
1833 next = bh->b_next_free;
1834 if(!lru_list[nlist]) {
1835 printk("Dirty list empty %d\n", i);
1836 break;
1839 /* Clean buffer on dirty list? Refile it */
1840 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
1842 refile_buffer(bh);
1843 continue;
1846 if (buffer_locked(bh) || !buffer_dirty(bh))
1847 continue;
1848 ndirty++;
1849 if(bh->b_flushtime > jiffies) continue;
1850 nwritten++;
1851 next->b_count++;
1852 bh->b_count++;
1853 bh->b_flushtime = 0;
1854 #ifdef DEBUG
1855 if(nlist != BUF_DIRTY) ncount++;
1856 #endif
1857 ll_rw_block(WRITE, 1, &bh);
1858 bh->b_count--;
1859 next->b_count--;
1862 run_task_queue(&tq_disk);
1863 #ifdef DEBUG
1864 if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
1865 printk("Wrote %d/%d buffers\n", nwritten, ndirty);
1866 #endif
1867 run_task_queue(&tq_disk);
1868 return 0;
1872 /* This is the interface to bdflush. As we get more sophisticated, we can
1873 * pass tuning parameters to this "process", to adjust how it behaves.
1874 * We would want to verify each parameter, however, to make sure that it
1875 * is reasonable. */
1877 asmlinkage int sys_bdflush(int func, long data)
1879 int i, error = -EPERM;
1881 lock_kernel();
1882 if (!capable(CAP_SYS_ADMIN))
1883 goto out;
1885 if (func == 1) {
1886 error = sync_old_buffers();
1887 goto out;
	/* Basically func 2 means read param 1, 3 means write param 1,
	 * 4 means read param 2, etc. */
1891 if (func >= 2) {
1892 i = (func-2) >> 1;
1893 error = -EINVAL;
1894 if (i < 0 || i >= N_PARAM)
1895 goto out;
1896 if((func & 1) == 0) {
1897 error = put_user(bdf_prm.data[i], (int*)data);
1898 goto out;
1900 if (data < bdflush_min[i] || data > bdflush_max[i])
1901 goto out;
1902 bdf_prm.data[i] = data;
1903 error = 0;
1904 goto out;
	/* Func 0 used to launch the actual bdflush and then never
	 * return (unless explicitly killed). We return zero here to
	 * remain semi-compatible with present update(8) programs.
	 */
1911 error = 0;
1912 out:
1913 unlock_kernel();
1914 return error;
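
#if 0
/*
 * Hedged sketch, not part of the original file: the func encoding used by
 * sys_bdflush() above, factored into a hypothetical helper.  For func >= 2,
 * even values read bdf_prm.data[(func-2)>>1] and odd values write it,
 * bounded by bdflush_min[]/bdflush_max[].
 */
static int bdflush_param_index(int func)
{
	return (func - 2) >> 1;	/* func 2,3 -> param 0; 4,5 -> param 1; ... */
}
#endif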
1917 /* This is the actual bdflush daemon itself. It used to be started from
1918 * the syscall above, but now we launch it ourselves internally with
1919 * kernel_thread(...) directly after the first thread in init/main.c */
/* To prevent deadlocks for a loop device:
 * 1) Do non-blocking writes to loop (avoids deadlock with running
 *    out of request blocks).
 * 2) But do a blocking write if the only dirty buffers are loop buffers
 *    (otherwise we go into an infinite busy-loop).
 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
 *    with running out of free buffers for loop's "real" device).
 */
1929 int bdflush(void * unused)
1931 int i;
1932 int ndirty;
1933 int nlist;
1934 int ncount;
1935 struct buffer_head * bh, *next;
1936 int major;
1937 int wrta_cmd = WRITEA; /* non-blocking write for LOOP */
1940 * We have a bare-bones task_struct, and really should fill
1941 * in a few more things so "top" and /proc/2/{exe,root,cwd}
1942 * display semi-sane things. Not real crucial though...
1945 current->session = 1;
1946 current->pgrp = 1;
1947 sprintf(current->comm, "kflushd");
1948 bdflush_tsk = current;
1951 * As a kernel thread we want to tamper with system buffers
1952 * and other internals and thus be subject to the SMP locking
1953 * rules. (On a uniprocessor box this does nothing).
1955 lock_kernel();
1957 for (;;) {
1958 #ifdef DEBUG
1959 printk("bdflush() activated...");
1960 #endif
1962 CHECK_EMERGENCY_SYNC
1964 ncount = 0;
1965 #ifdef DEBUG
1966 for(nlist = 0; nlist < NR_LIST; nlist++)
1967 #else
1968 for(nlist = BUF_DIRTY; nlist <= BUF_DIRTY; nlist++)
1969 #endif
1971 ndirty = 0;
1972 refilled = 0;
1973 repeat:
1975 bh = lru_list[nlist];
1976 if(bh)
1977 for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
1978 bh = next) {
1979 /* We may have stalled while waiting for I/O to complete. */
1980 if(bh->b_list != nlist) goto repeat;
1981 next = bh->b_next_free;
1982 if(!lru_list[nlist]) {
1983 printk("Dirty list empty %d\n", i);
1984 break;
1987 /* Clean buffer on dirty list? Refile it */
1988 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
1990 refile_buffer(bh);
1991 continue;
1994 if (buffer_locked(bh) || !buffer_dirty(bh))
1995 continue;
1996 major = MAJOR(bh->b_dev);
1997 /* Should we write back buffers that are shared or not??
1998 currently dirty buffers are not shared, so it does not matter */
1999 if (refilled && major == LOOP_MAJOR)
2000 continue;
2001 next->b_count++;
2002 bh->b_count++;
2003 ndirty++;
2004 bh->b_flushtime = 0;
2005 if (major == LOOP_MAJOR) {
2006 ll_rw_block(wrta_cmd,1, &bh);
2007 wrta_cmd = WRITEA;
2008 if (buffer_dirty(bh))
2009 --ndirty;
2011 else
2012 ll_rw_block(WRITE, 1, &bh);
2013 #ifdef DEBUG
2014 if(nlist != BUF_DIRTY) ncount++;
2015 #endif
2016 bh->b_count--;
2017 next->b_count--;
2020 #ifdef DEBUG
2021 if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
2022 printk("sleeping again.\n");
2023 #endif
2024 /* If we didn't write anything, but there are still
2025 * dirty buffers, then make the next write to a
2026 * loop device to be a blocking write.
2027 * This lets us block--which we _must_ do! */
2028 if (ndirty == 0 && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
2029 wrta_cmd = WRITE;
2030 continue;
2032 run_task_queue(&tq_disk);
2033 wake_up(&bdflush_done);
2035 /* If there are still a lot of dirty buffers around, skip the sleep
2036 and flush some more */
2037 if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
2038 spin_lock_irq(&current->sigmask_lock);
2039 flush_signals(current);
2040 spin_unlock_irq(&current->sigmask_lock);
2042 interruptible_sleep_on(&bdflush_wait);