Linux 2.1.89-4
[davej-history.git] / fs / buffer.c
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Some bdflush() changes for the dynamic ramdisk - Paul Gortmaker, 12/94 */
14 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
16 /* Removed a lot of unnecessary code and simplified things now that
17 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
20 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
21 * hash table, use SLAB cache for buffer heads. -DaveM
24 #include <linux/sched.h>
25 #include <linux/kernel.h>
26 #include <linux/major.h>
27 #include <linux/string.h>
28 #include <linux/locks.h>
29 #include <linux/errno.h>
30 #include <linux/malloc.h>
31 #include <linux/slab.h>
32 #include <linux/pagemap.h>
33 #include <linux/swap.h>
34 #include <linux/swapctl.h>
35 #include <linux/smp.h>
36 #include <linux/smp_lock.h>
37 #include <linux/vmalloc.h>
38 #include <linux/blkdev.h>
39 #include <linux/sysrq.h>
40 #include <linux/file.h>
42 #include <asm/system.h>
43 #include <asm/uaccess.h>
44 #include <asm/io.h>
45 #include <asm/bitops.h>
47 #define NR_SIZES 5
48 static char buffersize_index[17] =
49 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4};
51 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
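/*
 * BUFSIZE_INDEX maps a buffer size onto its slot in buffersize_index[]
 * (and hence free_list[]): 512 -> 0, 1024 -> 1, 2048 -> 2, 4096 -> 3,
 * 8192 -> 4.  A -1 entry marks a size we do not support.
 */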
52 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
53 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
54 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
55 number of unused buffer heads */
58 * How large a hash table do we need?
60 #define HASH_PAGES_ORDER 4
61 #define HASH_PAGES (1UL << HASH_PAGES_ORDER)
62 #define NR_HASH (HASH_PAGES*PAGE_SIZE/sizeof(struct buffer_head *))
63 #define HASH_MASK (NR_HASH-1)
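/*
 * Rough sizing: with HASH_PAGES_ORDER 4 the table occupies 16 pages, so
 * on a machine with 4kB pages and 4-byte pointers NR_HASH works out to
 * 16 * 4096 / 4 = 16384 hash chains.  NR_HASH is always a power of two,
 * which is what makes the HASH_MASK trick valid.
 */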
65 static int grow_buffers(int pri, int size);
67 static struct buffer_head ** hash_table;
68 static struct buffer_head * lru_list[NR_LIST] = {NULL, };
69 static struct buffer_head * free_list[NR_SIZES] = {NULL, };
71 static kmem_cache_t *bh_cachep;
73 static struct buffer_head * unused_list = NULL;
74 static struct buffer_head * reuse_list = NULL;
75 static struct wait_queue * buffer_wait = NULL;
77 static int nr_buffers = 0;
78 static int nr_buffers_type[NR_LIST] = {0,};
79 static int nr_buffer_heads = 0;
80 static int nr_unused_buffer_heads = 0;
81 static int refilled = 0; /* Set NZ when a buffer freelist is refilled
82 this is used by the loop device */
84 /* This is used by some architectures to estimate available memory. */
85 int buffermem = 0;
87 /* Here is the parameter block for the bdflush process. If you add or
88 * remove any of the parameters, make sure to update kernel/sysctl.c.
91 #define N_PARAM 9
93 /* The dummy values in this structure are left in there for compatibility
94 * with old programs that play with the /proc entries.
96 union bdflush_param{
97 struct {
98 int nfract; /* Percentage of buffer cache dirty to
99 activate bdflush */
100 int ndirty; /* Maximum number of dirty blocks to write out per
101 wake-cycle */
102 int nrefill; /* Number of clean buffers to try to obtain
103 each time we call refill */
104 int nref_dirt; /* Dirty buffer threshold for activating bdflush
105 when trying to refill buffers. */
106 int dummy1; /* unused */
107 int age_buffer; /* Time for normal buffer to age before
108 we flush it */
109 int age_super; /* Time for superblock to age before we
110 flush it */
111 int dummy2; /* unused */
112 int dummy3; /* unused */
113 } b_un;
114 unsigned int data[N_PARAM];
115 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
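/*
 * With the defaults above: bdflush kicks in when 40% of the buffer cache
 * is dirty (nfract), writes at most 500 buffers per wake-up (ndirty),
 * tries to obtain 64 clean buffers per refill (nrefill), and ages normal
 * buffers 30 seconds and superblocks 5 seconds (age_buffer/age_super)
 * before flushing them.  The dummy fields are kept only for /proc
 * compatibility.
 */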
117 /* These are the min and max parameter values that we will allow to be assigned */
118 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 100, 100, 1, 1};
119 int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 60000, 60000, 2047, 5};
121 void wakeup_bdflush(int);
124 * Rewrote the wait-routines to use the "new" wait-queue functionality,
125 * and got rid of the cli-sti pairs. The wait-queue routines still
126 * need cli-sti, but now it's just a couple of 386 instructions or so.
128 * Note that the real wait_on_buffer() is an inline function that checks
129 * if 'b_wait' is set before calling this, so that the queues aren't set
130 * up unnecessarily.
132 void __wait_on_buffer(struct buffer_head * bh)
134 struct task_struct *tsk = current;
135 struct wait_queue wait;
137 bh->b_count++;
138 wait.task = tsk;
139 add_wait_queue(&bh->b_wait, &wait);
140 repeat:
141 tsk->state = TASK_UNINTERRUPTIBLE;
142 run_task_queue(&tq_disk);
143 if (buffer_locked(bh)) {
144 schedule();
145 goto repeat;
147 tsk->state = TASK_RUNNING;
148 remove_wait_queue(&bh->b_wait, &wait);
149 bh->b_count--;
152 /* Call sync_buffers with wait!=0 to ensure that the call does not
153 * return until all buffer writes have completed. Sync() may return
154 * before the writes have finished; fsync() may not.
157 /* Godamity-damn. Some buffers (bitmaps for filesystems)
158 * spontaneously dirty themselves without ever brelse being called.
159 * We will ultimately want to put these in a separate list, but for
160 * now we search all of the lists for dirty buffers.
162 static int sync_buffers(kdev_t dev, int wait)
164 int i, retry, pass = 0, err = 0;
165 struct buffer_head * bh, *next;
167 /* One pass for no-wait, three for wait:
168 * 0) write out all dirty, unlocked buffers;
169 * 1) write out all dirty buffers, waiting if locked;
170 * 2) wait for completion by waiting for all buffers to unlock.
172 do {
173 retry = 0;
174 repeat:
175 /* We search all lists as a failsafe mechanism, not because we expect
176 * there to be dirty buffers on any of the other lists.
178 bh = lru_list[BUF_DIRTY];
179 if (!bh)
180 goto repeat2;
181 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
182 if (bh->b_list != BUF_DIRTY)
183 goto repeat;
184 next = bh->b_next_free;
185 if (!lru_list[BUF_DIRTY])
186 break;
187 if (dev && bh->b_dev != dev)
188 continue;
189 if (buffer_locked(bh)) {
190 /* Buffer is locked; skip it unless wait is
191 * requested AND pass > 0.
193 if (!wait || !pass) {
194 retry = 1;
195 continue;
197 wait_on_buffer (bh);
198 goto repeat;
201 /* If an unlocked buffer is not uptodate, there has
202 * been an IO error. Skip it.
204 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
205 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
206 err = -EIO;
207 continue;
210 /* Don't write clean buffers. Don't write ANY buffers
211 * on the third pass.
213 if (!buffer_dirty(bh) || pass >= 2)
214 continue;
216 /* Don't bother about locked buffers.
218 * XXX We checked if it was locked above and there is no
219 * XXX way we could have slept in between. -DaveM
221 if (buffer_locked(bh))
222 continue;
223 bh->b_count++;
224 next->b_count++;
225 bh->b_flushtime = 0;
226 ll_rw_block(WRITE, 1, &bh);
227 bh->b_count--;
228 next->b_count--;
229 retry = 1;
232 repeat2:
233 bh = lru_list[BUF_LOCKED];
234 if (!bh)
235 break;
236 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
237 if (bh->b_list != BUF_LOCKED)
238 goto repeat2;
239 next = bh->b_next_free;
240 if (!lru_list[BUF_LOCKED])
241 break;
242 if (dev && bh->b_dev != dev)
243 continue;
244 if (buffer_locked(bh)) {
245 /* Buffer is locked; skip it unless wait is
246 * requested AND pass > 0.
248 if (!wait || !pass) {
249 retry = 1;
250 continue;
252 wait_on_buffer (bh);
253 goto repeat2;
257 /* If we are waiting for the sync to succeed, and if any dirty
258 * blocks were written, then repeat; on the second pass, only
259 * wait for buffers being written (do not pass to write any
260 * more buffers on the second pass).
262 } while (wait && retry && ++pass<=2);
263 return err;
266 void sync_dev(kdev_t dev)
268 sync_buffers(dev, 0);
269 sync_supers(dev);
270 sync_inodes(dev);
271 sync_buffers(dev, 0);
272 sync_dquots(dev, -1);
274 * FIXME(eric) we need to sync the physical devices here.
275 * This is because some (scsi) controllers have huge amounts of
276 * cache onboard (hundreds of Mb), and we need to instruct
277 * them to commit all of the dirty memory to disk, and we should
278 * not return until this has happened.
280 * This would need to get implemented by going through the assorted
281 * layers so that each block major number can be synced, and this
282 * would call down into the upper and mid-layer scsi.
286 int fsync_dev(kdev_t dev)
288 sync_buffers(dev, 0);
289 sync_supers(dev);
290 sync_inodes(dev);
291 sync_dquots(dev, -1);
292 return sync_buffers(dev, 1);
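/*
 * Note the difference from sync_dev() above: fsync_dev() finishes with
 * sync_buffers(dev, 1), so it does not return until the writes have
 * completed, whereas sync_dev() only starts them (wait == 0).
 */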
295 asmlinkage int sys_sync(void)
297 lock_kernel();
298 fsync_dev(0);
299 unlock_kernel();
300 return 0;
304 * filp may be NULL if called via the msync of a vma.
307 int file_fsync(struct file *filp, struct dentry *dentry)
309 struct inode * inode = dentry->d_inode;
310 struct super_block * sb;
311 kdev_t dev;
313 /* sync the inode to buffers */
314 write_inode_now(inode);
316 /* sync the superblock to buffers */
317 sb = inode->i_sb;
318 wait_on_super(sb);
319 if (sb->s_op && sb->s_op->write_super)
320 sb->s_op->write_super(sb);
322 /* .. finally sync the buffers to disk */
323 dev = inode->i_dev;
324 return sync_buffers(dev, 1);
327 asmlinkage int sys_fsync(unsigned int fd)
329 struct file * file;
330 struct dentry * dentry;
331 struct inode * inode;
332 int err;
334 lock_kernel();
335 err = -EBADF;
336 file = fget(fd);
337 if (!file)
338 goto out;
340 dentry = file->f_dentry;
341 if (!dentry)
342 goto out_putf;
344 inode = dentry->d_inode;
345 if (!inode)
346 goto out_putf;
348 err = -EINVAL;
349 if (!file->f_op || !file->f_op->fsync)
350 goto out_putf;
352 /* We need to protect against concurrent writers.. */
353 down(&inode->i_sem);
354 err = file->f_op->fsync(file, dentry);
355 up(&inode->i_sem);
357 out_putf:
358 fput(file);
359 out:
360 unlock_kernel();
361 return err;
364 asmlinkage int sys_fdatasync(unsigned int fd)
366 struct file * file;
367 struct dentry * dentry;
368 struct inode * inode;
369 int err;
371 lock_kernel();
372 err = -EBADF;
373 file = fget(fd);
374 if (!file)
375 goto out;
377 dentry = file->f_dentry;
378 if (!dentry)
379 goto out_putf;
381 inode = dentry->d_inode;
382 if (!inode)
383 goto out_putf;
385 err = -EINVAL;
386 if (!file->f_op || !file->f_op->fsync)
387 goto out_putf;
389 /* this needs further work; at the moment it is identical to fsync() */
390 err = file->f_op->fsync(file, dentry);
392 out_putf:
393 fput(file);
394 out:
395 unlock_kernel();
396 return err;
399 void invalidate_buffers(kdev_t dev)
401 int i;
402 int nlist;
403 struct buffer_head * bh;
405 for(nlist = 0; nlist < NR_LIST; nlist++) {
406 bh = lru_list[nlist];
407 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
408 if (bh->b_dev != dev)
409 continue;
410 wait_on_buffer(bh);
411 if (bh->b_dev != dev)
412 continue;
413 if (bh->b_count)
414 continue;
415 bh->b_flushtime = 0;
416 clear_bit(BH_Protected, &bh->b_state);
417 clear_bit(BH_Uptodate, &bh->b_state);
418 clear_bit(BH_Dirty, &bh->b_state);
419 clear_bit(BH_Req, &bh->b_state);
424 #define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block))&HASH_MASK)
425 #define hash(dev,block) hash_table[_hashfn(dev,block)]
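/*
 * The hash chain for a block is picked by XOR-ing the (hashed) device
 * number with the block number and masking down to the table size, i.e.
 * a buffer lives on hash_table[(HASHDEV(dev) ^ block) & HASH_MASK].
 */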
427 static inline void remove_from_hash_queue(struct buffer_head * bh)
429 if (bh->b_pprev) {
430 if(bh->b_next)
431 bh->b_next->b_pprev = bh->b_pprev;
432 *bh->b_pprev = bh->b_next;
433 bh->b_pprev = NULL;
437 static inline void remove_from_lru_list(struct buffer_head * bh)
439 if (!(bh->b_prev_free) || !(bh->b_next_free))
440 panic("VFS: LRU block list corrupted");
441 if (bh->b_dev == B_FREE)
442 panic("LRU list corrupted");
443 bh->b_prev_free->b_next_free = bh->b_next_free;
444 bh->b_next_free->b_prev_free = bh->b_prev_free;
446 if (lru_list[bh->b_list] == bh)
447 lru_list[bh->b_list] = bh->b_next_free;
448 if (lru_list[bh->b_list] == bh)
449 lru_list[bh->b_list] = NULL;
450 bh->b_next_free = bh->b_prev_free = NULL;
453 static inline void remove_from_free_list(struct buffer_head * bh)
455 int isize = BUFSIZE_INDEX(bh->b_size);
456 if (!(bh->b_prev_free) || !(bh->b_next_free))
457 panic("VFS: Free block list corrupted");
458 if(bh->b_dev != B_FREE)
459 panic("Free list corrupted");
460 if(!free_list[isize])
461 panic("Free list empty");
462 if(bh->b_next_free == bh)
463 free_list[isize] = NULL;
464 else {
465 bh->b_prev_free->b_next_free = bh->b_next_free;
466 bh->b_next_free->b_prev_free = bh->b_prev_free;
467 if (free_list[isize] == bh)
468 free_list[isize] = bh->b_next_free;
470 bh->b_next_free = bh->b_prev_free = NULL;
473 static inline void remove_from_queues(struct buffer_head * bh)
475 if(bh->b_dev == B_FREE) {
476 remove_from_free_list(bh); /* Free list entries should not be
477 in the hash queue */
478 return;
480 nr_buffers_type[bh->b_list]--;
481 remove_from_hash_queue(bh);
482 remove_from_lru_list(bh);
485 static inline void put_last_lru(struct buffer_head * bh)
487 if (bh) {
488 struct buffer_head **bhp = &lru_list[bh->b_list];
490 if (bh == *bhp) {
491 *bhp = bh->b_next_free;
492 return;
495 if(bh->b_dev == B_FREE)
496 panic("Wrong block for lru list");
498 /* Add to back of free list. */
499 remove_from_lru_list(bh);
500 if(!*bhp) {
501 *bhp = bh;
502 (*bhp)->b_prev_free = bh;
505 bh->b_next_free = *bhp;
506 bh->b_prev_free = (*bhp)->b_prev_free;
507 (*bhp)->b_prev_free->b_next_free = bh;
508 (*bhp)->b_prev_free = bh;
512 static inline void put_last_free(struct buffer_head * bh)
514 if (bh) {
515 struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
517 bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */
519 /* Add to back of free list. */
520 if(!*bhp) {
521 *bhp = bh;
522 bh->b_prev_free = bh;
525 bh->b_next_free = *bhp;
526 bh->b_prev_free = (*bhp)->b_prev_free;
527 (*bhp)->b_prev_free->b_next_free = bh;
528 (*bhp)->b_prev_free = bh;
532 static inline void insert_into_queues(struct buffer_head * bh)
534 /* put at end of free list */
535 if(bh->b_dev == B_FREE) {
536 put_last_free(bh);
537 } else {
538 struct buffer_head **bhp = &lru_list[bh->b_list];
540 if(!*bhp) {
541 *bhp = bh;
542 bh->b_prev_free = bh;
545 if (bh->b_next_free)
546 panic("VFS: buffer LRU pointers corrupted");
548 bh->b_next_free = *bhp;
549 bh->b_prev_free = (*bhp)->b_prev_free;
550 (*bhp)->b_prev_free->b_next_free = bh;
551 (*bhp)->b_prev_free = bh;
553 nr_buffers_type[bh->b_list]++;
555 /* Put the buffer in new hash-queue if it has a device. */
556 if (bh->b_dev) {
557 struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
558 if((bh->b_next = *bhp) != NULL)
559 (*bhp)->b_pprev = &bh->b_next;
560 *bhp = bh;
561 bh->b_pprev = bhp; /* Exists in bh hashes. */
562 } else
563 bh->b_pprev = NULL; /* Not in bh hashes. */
567 struct buffer_head * find_buffer(kdev_t dev, int block, int size)
569 struct buffer_head * next;
571 next = hash(dev,block);
572 for (;;) {
573 struct buffer_head *tmp = next;
574 if (!next)
575 break;
576 next = tmp->b_next;
577 if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
578 continue;
579 next = tmp;
580 break;
582 return next;
586 * Why like this, I hear you say... The reason is race-conditions.
587 * As we don't lock buffers (unless we are reading them, that is),
588 * something might happen to the buffer while we sleep (ie a read-error
589 * will force it bad). This shouldn't really happen currently, but
590 * the code is ready.
592 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
594 struct buffer_head * bh;
595 for (;;) {
596 bh = find_buffer(dev,block,size);
597 if (!bh)
598 break;
599 bh->b_count++;
600 bh->b_lru_time = jiffies;
601 if (!buffer_locked(bh))
602 break;
603 __wait_on_buffer(bh);
604 if (bh->b_dev == dev &&
605 bh->b_blocknr == block &&
606 bh->b_size == size)
607 break;
608 bh->b_count--;
610 return bh;
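/*
 * Note that a non-NULL return from get_hash_table() has had b_count
 * incremented, so the caller is expected to drop the reference again
 * (normally via brelse()).
 */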
613 unsigned int get_hardblocksize(kdev_t dev)
616 * Get the hard sector size for the given device. If we don't know
617 * what it is, return 0.
619 if (hardsect_size[MAJOR(dev)] != NULL) {
620 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
621 if (blksize != 0)
622 return blksize;
626 * We don't know what the hardware sector size for this device is.
627 * Return 0 indicating that we don't know.
629 return 0;
632 void set_blocksize(kdev_t dev, int size)
634 extern int *blksize_size[];
635 int i, nlist;
636 struct buffer_head * bh, *bhnext;
638 if (!blksize_size[MAJOR(dev)])
639 return;
641 if (size > PAGE_SIZE)
642 size = 0;
644 switch (size) {
645 default: panic("Invalid blocksize passed to set_blocksize");
646 case 512: case 1024: case 2048: case 4096: case 8192: ;
649 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
650 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
651 return;
653 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
654 return;
655 sync_buffers(dev, 2);
656 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
658 /* We need to be quite careful how we do this - we are moving entries
659 * around on the free list, and we can get in a loop if we are not careful.
661 for(nlist = 0; nlist < NR_LIST; nlist++) {
662 bh = lru_list[nlist];
663 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
664 if(!bh)
665 break;
667 bhnext = bh->b_next_free;
668 if (bh->b_dev != dev)
669 continue;
670 if (bh->b_size == size)
671 continue;
672 bhnext->b_count++;
673 wait_on_buffer(bh);
674 bhnext->b_count--;
675 if (bh->b_dev == dev && bh->b_size != size) {
676 clear_bit(BH_Dirty, &bh->b_state);
677 clear_bit(BH_Uptodate, &bh->b_state);
678 clear_bit(BH_Req, &bh->b_state);
679 bh->b_flushtime = 0;
681 remove_from_hash_queue(bh);
687 * Find a candidate buffer to be reclaimed.
688 * N.B. Must search the entire BUF_LOCKED list rather than terminating
689 * when the first locked buffer is found. Buffers are unlocked at
690 * completion of IO, and under some conditions there may be (many)
691 * unlocked buffers after the first locked one.
693 static struct buffer_head *find_candidate(struct buffer_head *bh,
694 int *list_len, int size)
696 if (!bh)
697 goto no_candidate;
699 for (; (*list_len) > 0; bh = bh->b_next_free, (*list_len)--) {
700 if (size != bh->b_size) {
701 /* This provides a mechanism for freeing blocks
702 * of other sizes, this is necessary now that we
703 * no longer have the lav code.
705 try_to_free_buffer(bh,&bh,1);
706 if (!bh)
707 break;
708 continue;
710 else if (!bh->b_count &&
711 !buffer_locked(bh) &&
712 !buffer_protected(bh) &&
713 !buffer_dirty(bh))
714 return bh;
717 no_candidate:
718 return NULL;
721 static void refill_freelist(int size)
723 struct buffer_head * bh, * next;
724 struct buffer_head * candidate[BUF_DIRTY];
725 int buffers[BUF_DIRTY];
726 int i;
727 int needed, obtained=0;
729 refilled = 1;
731 /* We are going to try to locate this much memory. */
732 needed = bdf_prm.b_un.nrefill * size;
734 while ((nr_free_pages > min_free_pages*2) &&
735 grow_buffers(GFP_BUFFER, size)) {
736 obtained += PAGE_SIZE;
737 if (obtained >= needed)
738 return;
742 * Update the needed amount based on the number of potentially
743 * freeable buffers. We don't want to free more than one quarter
744 * of the available buffers.
746 i = (nr_buffers_type[BUF_CLEAN] + nr_buffers_type[BUF_LOCKED]) >> 2;
747 if (i < bdf_prm.b_un.nrefill) {
748 needed = i * size;
749 if (needed < PAGE_SIZE)
750 needed = PAGE_SIZE;
754 * OK, we cannot grow the buffer cache, now try to get some
755 * from the lru list.
757 repeat:
758 if (obtained >= needed)
759 return;
762 * First set the candidate pointers to usable buffers. This
763 * should be quick nearly all of the time. N.B. There must be
764 * no blocking calls after setting up the candidate[] array!
766 for (i = BUF_CLEAN; i<BUF_DIRTY; i++) {
767 buffers[i] = nr_buffers_type[i];
768 candidate[i] = find_candidate(lru_list[i], &buffers[i], size);
772 * Select the older of the available buffers until we reach our goal.
774 for (;;) {
775 i = BUF_CLEAN;
776 if (!candidate[BUF_CLEAN]) {
777 if (!candidate[BUF_LOCKED])
778 break;
779 i = BUF_LOCKED;
781 else if (candidate[BUF_LOCKED] &&
782 (candidate[BUF_LOCKED]->b_lru_time <
783 candidate[BUF_CLEAN ]->b_lru_time))
784 i = BUF_LOCKED;
786 * Free the selected buffer and get the next candidate.
788 bh = candidate[i];
789 next = bh->b_next_free;
791 obtained += bh->b_size;
792 remove_from_queues(bh);
793 put_last_free(bh);
794 if (obtained >= needed)
795 return;
797 if (--buffers[i] && bh != next)
798 candidate[i] = find_candidate(next, &buffers[i], size);
799 else
800 candidate[i] = NULL;
804 * If there are dirty buffers, do a non-blocking wake-up.
805 * This increases the chances of having buffers available
806 * for the next call ...
808 if (nr_buffers_type[BUF_DIRTY])
809 wakeup_bdflush(0);
812 * Allocate buffers to reach half our goal, if possible.
813 * Since the allocation doesn't block, there's no reason
814 * to search the buffer lists again. Then return if there
815 * are _any_ free buffers.
817 while (obtained < (needed >> 1) &&
818 nr_free_pages > min_free_pages + 5 &&
819 grow_buffers(GFP_BUFFER, size))
820 obtained += PAGE_SIZE;
822 if (free_list[BUFSIZE_INDEX(size)])
823 return;
826 * If there are dirty buffers, wait while bdflush writes
827 * them out. The buffers become locked, but we can just
828 * wait for one to unlock ...
830 if (nr_buffers_type[BUF_DIRTY])
831 wakeup_bdflush(1);
834 * In order to prevent a buffer shortage from exhausting
835 * the system's reserved pages, we force tasks to wait
836 * before using reserved pages for buffers. This is easily
837 * accomplished by waiting on an unused locked buffer.
839 if ((bh = lru_list[BUF_LOCKED]) != NULL) {
840 for (i = nr_buffers_type[BUF_LOCKED]; i--; bh = bh->b_next_free)
842 if (bh->b_size != size)
843 continue;
844 if (bh->b_count)
845 continue;
846 if (!buffer_locked(bh))
847 continue;
848 if (buffer_dirty(bh) || buffer_protected(bh))
849 continue;
850 if (MAJOR(bh->b_dev) == LOOP_MAJOR)
851 continue;
853 * We've found an unused, locked, non-dirty buffer of
854 * the correct size. Claim it so no one else can,
855 * then wait for it to unlock.
857 bh->b_count++;
858 wait_on_buffer(bh);
859 bh->b_count--;
861 * Loop back to harvest this (and maybe other) buffers.
863 goto repeat;
868 * Convert a reserved page into buffers ... should happen only rarely.
870 if (nr_free_pages > (min_free_pages >> 1) &&
871 grow_buffers(GFP_ATOMIC, size)) {
872 #ifdef BUFFER_DEBUG
873 printk("refill_freelist: used reserve page\n");
874 #endif
875 return;
879 * System is _very_ low on memory ... sleep and try later.
881 #ifdef BUFFER_DEBUG
882 printk("refill_freelist: task %s waiting for buffers\n", current->comm);
883 #endif
884 schedule();
885 goto repeat;
888 void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
889 bh_end_io_t *handler, void *dev_id)
891 bh->b_count = 1;
892 bh->b_list = BUF_CLEAN;
893 bh->b_flushtime = 0;
894 bh->b_dev = dev;
895 bh->b_blocknr = block;
896 bh->b_end_io = handler;
897 bh->b_dev_id = dev_id;
900 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
902 mark_buffer_uptodate(bh, uptodate);
903 unlock_buffer(bh);
907 * Ok, this is getblk, and it isn't very clear, again to hinder
908 * race-conditions. Most of the code is seldom used, (ie repeating),
909 * so it should be much more efficient than it looks.
911 * The algorithm is changed: hopefully better, and an elusive bug removed.
913 * 14.02.92: changed it to sync dirty buffers a bit: better performance
914 * when the filesystem starts to get full of dirty blocks (I hope).
916 struct buffer_head * getblk(kdev_t dev, int block, int size)
918 struct buffer_head * bh;
919 int isize;
921 repeat:
922 bh = get_hash_table(dev, block, size);
923 if (bh) {
924 if (!buffer_dirty(bh)) {
925 if (buffer_uptodate(bh))
926 put_last_lru(bh);
927 bh->b_flushtime = 0;
929 set_bit(BH_Touched, &bh->b_state);
930 return bh;
933 isize = BUFSIZE_INDEX(size);
934 get_free:
935 bh = free_list[isize];
936 if (!bh)
937 goto refill;
938 remove_from_free_list(bh);
940 /* OK, FINALLY we know that this buffer is the only one of its kind,
941 * and that it's unused (b_count=0), unlocked, and clean.
943 init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
944 bh->b_lru_time = jiffies;
945 bh->b_state=(1<<BH_Touched);
946 insert_into_queues(bh);
947 return bh;
950 * If we block while refilling the free list, somebody may
951 * create the buffer first ... search the hashes again.
953 refill:
954 refill_freelist(size);
955 if (!find_buffer(dev,block,size))
956 goto get_free;
957 goto repeat;
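/*
 * Typical use (see bread() below): getblk() only guarantees a buffer of
 * the right dev/block/size; if it is not uptodate the caller issues
 * ll_rw_block(READ, 1, &bh), waits with wait_on_buffer(bh), and finally
 * drops the reference with brelse().
 */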
960 void set_writetime(struct buffer_head * buf, int flag)
962 int newtime;
964 if (buffer_dirty(buf)) {
965 /* Set the time at which this dirty buffer should be written back. */
966 newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
967 bdf_prm.b_un.age_buffer);
968 if(!buf->b_flushtime || buf->b_flushtime > newtime)
969 buf->b_flushtime = newtime;
970 } else {
971 buf->b_flushtime = 0;
977 * Put a buffer into the appropriate list, without side-effects.
979 static inline void file_buffer(struct buffer_head *bh, int list)
981 remove_from_queues(bh);
982 bh->b_list = list;
983 insert_into_queues(bh);
987 * A buffer may need to be moved from one buffer list to another
988 * (e.g. in case it is not shared any more). Handle this.
990 void refile_buffer(struct buffer_head * buf)
992 int dispose;
994 if(buf->b_dev == B_FREE) {
995 printk("Attempt to refile free buffer\n");
996 return;
998 if (buffer_dirty(buf))
999 dispose = BUF_DIRTY;
1000 else if (buffer_locked(buf))
1001 dispose = BUF_LOCKED;
1002 else
1003 dispose = BUF_CLEAN;
1004 if(dispose != buf->b_list) {
1005 file_buffer(buf, dispose);
1006 if(dispose == BUF_DIRTY) {
1007 int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
1009 /* This buffer is dirty, maybe we need to start flushing.
1010 * If too high a percentage of the buffers are dirty...
1012 if (nr_buffers_type[BUF_DIRTY] > too_many)
1013 wakeup_bdflush(0);
1015 /* If this is a loop device, and
1016 * more than half of the buffers are dirty...
1017 * (Prevents no-free-buffers deadlock with loop device.)
1019 if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
1020 nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
1021 wakeup_bdflush(1);
1027 * Release a buffer head
1029 void __brelse(struct buffer_head * buf)
1031 wait_on_buffer(buf);
1033 /* If dirty, mark the time this buffer should be written back. */
1034 set_writetime(buf, 0);
1035 refile_buffer(buf);
1037 if (buf->b_count) {
1038 buf->b_count--;
1039 return;
1041 printk("VFS: brelse: Trying to free free buffer\n");
1045 * bforget() is like brelse(), except it removes the buffer
1046 * from the hash-queues (so that it won't be re-used if it's
1047 * shared).
1049 void __bforget(struct buffer_head * buf)
1051 wait_on_buffer(buf);
1052 mark_buffer_clean(buf);
1053 clear_bit(BH_Protected, &buf->b_state);
1054 buf->b_count--;
1055 remove_from_hash_queue(buf);
1056 buf->b_dev = NODEV;
1057 refile_buffer(buf);
1061 * bread() reads a specified block and returns the buffer that contains
1062 * it. It returns NULL if the block was unreadable.
1064 struct buffer_head * bread(kdev_t dev, int block, int size)
1066 struct buffer_head * bh;
1068 if (!(bh = getblk(dev, block, size))) {
1069 printk("VFS: bread: impossible error\n");
1070 return NULL;
1072 if (buffer_uptodate(bh))
1073 return bh;
1074 ll_rw_block(READ, 1, &bh);
1075 wait_on_buffer(bh);
1076 if (buffer_uptodate(bh))
1077 return bh;
1078 brelse(bh);
1079 return NULL;
1083 * Ok, breada can be used as bread, but additionally requests read-ahead
1084 * on the blocks that follow as well (up to the device's read_ahead
1085 * setting, and at most NBUF blocks per call).
1088 #define NBUF 16
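/*
 * NBUF bounds one breada() batch: bhlist[0] is the block actually wanted,
 * and at most NBUF-1 further blocks are submitted as read-ahead (READA).
 */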
1090 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
1091 unsigned int pos, unsigned int filesize)
1093 struct buffer_head * bhlist[NBUF];
1094 unsigned int blocks;
1095 struct buffer_head * bh;
1096 int index;
1097 int i, j;
1099 if (pos >= filesize)
1100 return NULL;
1102 if (block < 0 || !(bh = getblk(dev,block,bufsize)))
1103 return NULL;
1105 index = BUFSIZE_INDEX(bh->b_size);
1107 if (buffer_uptodate(bh))
1108 return(bh);
1109 else ll_rw_block(READ, 1, &bh);
1111 blocks = (filesize - pos) >> (9+index);
1113 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1114 blocks = read_ahead[MAJOR(dev)] >> index;
1115 if (blocks > NBUF)
1116 blocks = NBUF;
1118 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1121 bhlist[0] = bh;
1122 j = 1;
1123 for(i=1; i<blocks; i++) {
1124 bh = getblk(dev,block+i,bufsize);
1125 if (buffer_uptodate(bh)) {
1126 brelse(bh);
1127 break;
1129 else bhlist[j++] = bh;
1132 /* Request the read for these buffers, and then release them. */
1133 if (j>1)
1134 ll_rw_block(READA, (j-1), bhlist+1);
1135 for(i=1; i<j; i++)
1136 brelse(bhlist[i]);
1138 /* Wait for this buffer, and then continue on. */
1139 bh = bhlist[0];
1140 wait_on_buffer(bh);
1141 if (buffer_uptodate(bh))
1142 return bh;
1143 brelse(bh);
1144 return NULL;
1147 static void put_unused_buffer_head(struct buffer_head * bh)
1149 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1150 nr_buffer_heads--;
1151 kmem_cache_free(bh_cachep, bh);
1152 return;
1155 memset(bh,0,sizeof(*bh));
1156 nr_unused_buffer_heads++;
1157 bh->b_next_free = unused_list;
1158 unused_list = bh;
1159 if (!waitqueue_active(&buffer_wait))
1160 return;
1161 wake_up(&buffer_wait);
1165 * We can't put completed temporary IO buffer_heads directly onto the
1166 * unused_list when they become unlocked, since the device driver
1167 * end_request routines still expect access to the buffer_head's
1168 * fields after the final unlock. So, the device driver puts them on
1169 * the reuse_list instead once IO completes, and we recover these to
1170 * the unused_list here.
1172 static inline void recover_reusable_buffer_heads(void)
1174 struct buffer_head *head;
1176 head = xchg(&reuse_list, NULL);
1178 while (head) {
1179 struct buffer_head *bh = head;
1180 head = head->b_next_free;
1181 put_unused_buffer_head(bh);
1186 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1187 * no-buffer-head deadlock. Return NULL on failure; waiting for
1188 * buffer heads is now handled in create_buffers().
1190 static struct buffer_head * get_unused_buffer_head(int async)
1192 struct buffer_head * bh;
1194 recover_reusable_buffer_heads();
1195 if (nr_unused_buffer_heads > NR_RESERVED) {
1196 bh = unused_list;
1197 unused_list = bh->b_next_free;
1198 nr_unused_buffer_heads--;
1199 return bh;
1202 /* This is critical. We can't swap out pages to get
1203 * more buffer heads, because the swap-out may need
1204 * more buffer-heads itself. Thus SLAB_ATOMIC.
1206 if((bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC)) != NULL) {
1207 memset(bh, 0, sizeof(*bh));
1208 nr_buffer_heads++;
1209 return bh;
1213 * If we need an async buffer, use the reserved buffer heads.
1215 if (async && unused_list) {
1216 bh = unused_list;
1217 unused_list = bh->b_next_free;
1218 nr_unused_buffer_heads--;
1219 return bh;
1222 #if 0
1224 * (Pending further analysis ...)
1225 * Ordinary (non-async) requests can use a different memory priority
1226 * to free up pages. Any swapping thus generated will use async
1227 * buffer heads.
1229 if(!async &&
1230 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1231 memset(bh, 0, sizeof(*bh));
1232 nr_buffer_heads++;
1233 return bh;
1235 #endif
1237 return NULL;
1241 * Create the appropriate buffers when given a page for data area and
1242 * the size of each buffer. Use the bh->b_this_page linked list to
1243 * follow the buffers created. Return NULL if unable to create more
1244 * buffers.
1245 * The async flag is used to differentiate async IO (paging, swapping)
1246 * from ordinary buffer allocations, and only async requests are allowed
1247 * to sleep waiting for buffer heads.
1249 static struct buffer_head * create_buffers(unsigned long page,
1250 unsigned long size, int async)
1252 struct wait_queue wait = { current, NULL };
1253 struct buffer_head *bh, *head;
1254 long offset;
1256 try_again:
1257 head = NULL;
1258 offset = PAGE_SIZE;
1259 while ((offset -= size) >= 0) {
1260 bh = get_unused_buffer_head(async);
1261 if (!bh)
1262 goto no_grow;
1264 bh->b_dev = B_FREE; /* Flag as unused */
1265 bh->b_this_page = head;
1266 head = bh;
1268 bh->b_state = 0;
1269 bh->b_next_free = NULL;
1270 bh->b_count = 0;
1271 bh->b_size = size;
1273 bh->b_data = (char *) (page+offset);
1274 bh->b_list = 0;
1276 return head;
1278 * In case anything failed, we just free everything we got.
1280 no_grow:
1281 bh = head;
1282 while (bh) {
1283 head = bh;
1284 bh = bh->b_this_page;
1285 put_unused_buffer_head(head);
1289 * Return failure for non-async IO requests. Async IO requests
1290 * are not allowed to fail, so we have to wait until buffer heads
1291 * become available. But we don't want tasks sleeping with
1292 * partially complete buffers, so all were released above.
1294 if (!async)
1295 return NULL;
1297 /* Uhhuh. We're _really_ low on memory. Now we just
1298 * wait for old buffer heads to become free due to
1299 * finishing IO. Since this is an async request and
1300 * the reserve list is empty, we're sure there are
1301 * async buffer heads in use.
1303 run_task_queue(&tq_disk);
1306 * Set our state for sleeping, then check again for buffer heads.
1307 * This ensures we won't miss a wake_up from an interrupt.
1309 add_wait_queue(&buffer_wait, &wait);
1310 current->state = TASK_UNINTERRUPTIBLE;
1311 recover_reusable_buffer_heads();
1312 schedule();
1313 remove_wait_queue(&buffer_wait, &wait);
1314 current->state = TASK_RUNNING;
1315 goto try_again;
1318 /* Run the hooks that have to be done when a page I/O has completed. */
1319 static inline void after_unlock_page (struct page * page)
1321 if (test_and_clear_bit(PG_decr_after, &page->flags)) {
1322 atomic_dec(&nr_async_pages);
1323 #ifdef DEBUG_SWAP
1324 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1325 (char *) page_address(page),
1326 atomic_read(&nr_async_pages));
1327 #endif
1329 if (test_and_clear_bit(PG_free_after, &page->flags))
1330 __free_page(page);
1334 * Free all temporary buffers belonging to a page.
1335 * This needs to be called with interrupts disabled.
1337 static inline void free_async_buffers (struct buffer_head * bh)
1339 struct buffer_head * tmp;
1341 tmp = bh;
1342 do {
1343 tmp->b_next_free = xchg(&reuse_list, NULL);
1344 reuse_list = tmp;
1345 tmp = tmp->b_this_page;
1346 } while (tmp != bh);
1349 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
1351 unsigned long flags;
1352 struct buffer_head *tmp;
1353 struct page *page;
1355 mark_buffer_uptodate(bh, uptodate);
1356 unlock_buffer(bh);
1358 /* This is a temporary buffer used for page I/O. */
1359 page = mem_map + MAP_NR(bh->b_data);
1360 if (!PageLocked(page))
1361 goto not_locked;
1362 if (bh->b_count != 1)
1363 goto bad_count;
1365 if (!test_bit(BH_Uptodate, &bh->b_state))
1366 set_bit(PG_error, &page->flags);
1369 * Be _very_ careful from here on. Bad things can happen if
1370 * two buffer heads end IO at almost the same time and both
1371 * decide that the page is now completely done.
1373 * Async buffer_heads are here only as labels for IO, and get
1374 * thrown away once the IO for this page is complete. IO is
1375 * deemed complete once all buffers have been visited
1376 * (b_count==0) and are now unlocked. We must make sure that
1377 * only the _last_ buffer that decrements its count is the one
1378 * that frees the page..
1380 save_flags(flags);
1381 cli();
1382 bh->b_count--;
1383 tmp = bh;
1384 do {
1385 if (tmp->b_count)
1386 goto still_busy;
1387 tmp = tmp->b_this_page;
1388 } while (tmp != bh);
1390 /* OK, the async IO on this page is complete. */
1391 free_async_buffers(bh);
1392 restore_flags(flags);
1393 clear_bit(PG_locked, &page->flags);
1394 wake_up(&page->wait);
1395 after_unlock_page(page);
1396 wake_up(&buffer_wait);
1397 return;
1399 still_busy:
1400 restore_flags(flags);
1401 return;
1403 not_locked:
1404 printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1405 return;
1407 bad_count:
1408 printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1409 return;
1413 * Start I/O on a page.
1414 * This function expects the page to be locked and may return before I/O is complete.
1415 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1417 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1419 struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
1420 int block, nr;
1422 if (!PageLocked(page))
1423 panic("brw_page: page not locked for I/O");
1424 clear_bit(PG_uptodate, &page->flags);
1425 clear_bit(PG_error, &page->flags);
1427 * Allocate async buffer heads pointing to this page, just for I/O.
1428 * They do _not_ show up in the buffer hash table!
1429 * They are _not_ registered in page->buffers either!
1431 bh = create_buffers(page_address(page), size, 1);
1432 if (!bh) {
1433 /* WSH: exit here leaves page->count incremented */
1434 clear_bit(PG_locked, &page->flags);
1435 wake_up(&page->wait);
1436 return -ENOMEM;
1438 nr = 0;
1439 next = bh;
1440 do {
1441 struct buffer_head * tmp;
1442 block = *(b++);
1444 init_buffer(next, dev, block, end_buffer_io_async, NULL);
1445 set_bit(BH_Uptodate, &next->b_state);
1448 * When we use bmap, we define block zero to represent
1449 * a hole. ll_rw_page, however, may legitimately
1450 * access block zero, and we need to distinguish the
1451 * two cases.
1453 if (bmap && !block) {
1454 memset(next->b_data, 0, size);
1455 next->b_count--;
1456 continue;
1458 tmp = get_hash_table(dev, block, size);
1459 if (tmp) {
1460 if (!buffer_uptodate(tmp)) {
1461 if (rw == READ)
1462 ll_rw_block(READ, 1, &tmp);
1463 wait_on_buffer(tmp);
1465 if (rw == READ)
1466 memcpy(next->b_data, tmp->b_data, size);
1467 else {
1468 memcpy(tmp->b_data, next->b_data, size);
1469 mark_buffer_dirty(tmp, 0);
1471 brelse(tmp);
1472 next->b_count--;
1473 continue;
1475 if (rw == READ)
1476 clear_bit(BH_Uptodate, &next->b_state);
1477 else
1478 set_bit(BH_Dirty, &next->b_state);
1479 arr[nr++] = next;
1480 } while (prev = next, (next = next->b_this_page) != NULL);
1481 prev->b_this_page = bh;
1483 if (nr) {
1484 ll_rw_block(rw, nr, arr);
1485 /* The rest of the work is done in mark_buffer_uptodate()
1486 * and unlock_buffer(). */
1487 } else {
1488 unsigned long flags;
1489 clear_bit(PG_locked, &page->flags);
1490 set_bit(PG_uptodate, &page->flags);
1491 wake_up(&page->wait);
1492 save_flags(flags);
1493 cli();
1494 free_async_buffers(bh);
1495 restore_flags(flags);
1496 after_unlock_page(page);
1498 ++current->maj_flt;
1499 return 0;
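/*
 * On return the page is usually still locked: end_buffer_io_async() will
 * unlock it (and run after_unlock_page()) once the last temporary buffer
 * completes.  Only when every block could be satisfied from the cache or
 * was a bmap hole (nr == 0) is the page unlocked before returning.
 */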
1503 * This is called by end_request() when I/O has completed.
1505 void mark_buffer_uptodate(struct buffer_head * bh, int on)
1507 if (on) {
1508 struct buffer_head *tmp = bh;
1509 set_bit(BH_Uptodate, &bh->b_state);
1510 /* If a page has buffers and all these buffers are uptodate,
1511 * then the page is uptodate. */
1512 do {
1513 if (!test_bit(BH_Uptodate, &tmp->b_state))
1514 return;
1515 tmp=tmp->b_this_page;
1516 } while (tmp && tmp != bh);
1517 set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
1518 return;
1520 clear_bit(BH_Uptodate, &bh->b_state);
1524 * Generic "readpage" function for block devices that have the normal
1525 * bmap functionality. This is most of the block device filesystems.
1526 * Reads the page asynchronously --- the unlock_buffer() and
1527 * mark_buffer_uptodate() functions propagate buffer state into the
1528 * page struct once IO has completed.
1530 int generic_readpage(struct file * file, struct page * page)
1532 struct dentry *dentry = file->f_dentry;
1533 struct inode *inode = dentry->d_inode;
1534 unsigned long block;
1535 int *p, nr[PAGE_SIZE/512];
1536 int i;
1538 atomic_inc(&page->count);
1539 set_bit(PG_locked, &page->flags);
1540 set_bit(PG_free_after, &page->flags);
1542 i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1543 block = page->offset >> inode->i_sb->s_blocksize_bits;
1544 p = nr;
1545 do {
1546 *p = inode->i_op->bmap(inode, block);
1547 i--;
1548 block++;
1549 p++;
1550 } while (i > 0);
1552 /* IO start */
1553 brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
1554 return 0;
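/*
 * The page reference taken above is paired with PG_free_after: once the
 * async IO finishes, after_unlock_page() sees the flag and drops the
 * reference with __free_page().
 */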
1558 * Try to increase the number of buffers available: the size argument
1559 * is used to determine what kind of buffers we want.
1561 static int grow_buffers(int pri, int size)
1563 unsigned long page;
1564 struct buffer_head *bh, *tmp;
1565 struct buffer_head * insert_point;
1566 int isize;
1568 if ((size & 511) || (size > PAGE_SIZE)) {
1569 printk("VFS: grow_buffers: size = %d\n",size);
1570 return 0;
1573 if (!(page = __get_free_page(pri)))
1574 return 0;
1575 bh = create_buffers(page, size, 0);
1576 if (!bh) {
1577 free_page(page);
1578 return 0;
1581 isize = BUFSIZE_INDEX(size);
1582 insert_point = free_list[isize];
1584 tmp = bh;
1585 while (1) {
1586 if (insert_point) {
1587 tmp->b_next_free = insert_point->b_next_free;
1588 tmp->b_prev_free = insert_point;
1589 insert_point->b_next_free->b_prev_free = tmp;
1590 insert_point->b_next_free = tmp;
1591 } else {
1592 tmp->b_prev_free = tmp;
1593 tmp->b_next_free = tmp;
1595 insert_point = tmp;
1596 ++nr_buffers;
1597 if (tmp->b_this_page)
1598 tmp = tmp->b_this_page;
1599 else
1600 break;
1602 tmp->b_this_page = bh;
1603 free_list[isize] = bh;
1604 mem_map[MAP_NR(page)].buffers = bh;
1605 buffermem += PAGE_SIZE;
1606 return 1;
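/*
 * Each successful grow_buffers() call therefore adds exactly one page of
 * buffers: PAGE_SIZE/size buffer heads are linked into free_list[isize]
 * and the page is accounted for in buffermem.
 */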
1610 /* =========== Reduce the buffer memory ============= */
1612 static inline int buffer_waiting(struct buffer_head * bh)
1614 return waitqueue_active(&bh->b_wait);
1618 * try_to_free_buffer() checks if all the buffers on this particular page
1619 * are unused, and frees the page if so.
1621 int try_to_free_buffer(struct buffer_head * bh, struct buffer_head ** bhp,
1622 int priority)
1624 unsigned long page;
1625 struct buffer_head * tmp, * p;
1627 *bhp = bh;
1628 page = (unsigned long) bh->b_data;
1629 page &= PAGE_MASK;
1630 tmp = bh;
1631 do {
1632 if (!tmp)
1633 return 0;
1634 if (tmp->b_count || buffer_protected(tmp) ||
1635 buffer_dirty(tmp) || buffer_locked(tmp) ||
1636 buffer_waiting(tmp))
1637 return 0;
1638 if (priority && buffer_touched(tmp))
1639 return 0;
1640 tmp = tmp->b_this_page;
1641 } while (tmp != bh);
1642 tmp = bh;
1643 do {
1644 p = tmp;
1645 tmp = tmp->b_this_page;
1646 nr_buffers--;
1647 if (p == *bhp) {
1648 *bhp = p->b_prev_free;
1649 if (p == *bhp) /* Was this the last in the list? */
1650 *bhp = NULL;
1652 remove_from_queues(p);
1653 put_unused_buffer_head(p);
1654 } while (tmp != bh);
1655 buffermem -= PAGE_SIZE;
1656 mem_map[MAP_NR(page)].buffers = NULL;
1657 free_page(page);
1658 return 1;
1661 /* ================== Debugging =================== */
1663 void show_buffers(void)
1665 struct buffer_head * bh;
1666 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
1667 int protected = 0;
1668 int nlist;
1669 static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
1671 printk("Buffer memory: %6dkB\n",buffermem>>10);
1672 printk("Buffer heads: %6d\n",nr_buffer_heads);
1673 printk("Buffer blocks: %6d\n",nr_buffers);
1675 for(nlist = 0; nlist < NR_LIST; nlist++) {
1676 found = locked = dirty = used = lastused = protected = 0;
1677 bh = lru_list[nlist];
1678 if(!bh) continue;
1680 do {
1681 found++;
1682 if (buffer_locked(bh))
1683 locked++;
1684 if (buffer_protected(bh))
1685 protected++;
1686 if (buffer_dirty(bh))
1687 dirty++;
1688 if (bh->b_count)
1689 used++, lastused = found;
1690 bh = bh->b_next_free;
1691 } while (bh != lru_list[nlist]);
1692 printk("%8s: %d buffers, %d used (last=%d), "
1693 "%d locked, %d protected, %d dirty\n",
1694 buf_types[nlist], found, used, lastused,
1695 locked, protected, dirty);
1700 /* ===================== Init ======================= */
1703 * Allocate the hash table and init the free list.
1704 * Use gfp() for the hash table to decrease TLB misses, and use the
1705 * SLAB cache for buffer heads.
1707 void buffer_init(void)
1709 hash_table = (struct buffer_head **)
1710 __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER);
1711 if (!hash_table)
1712 panic("Failed to allocate buffer hash table\n");
1713 memset(hash_table,0,NR_HASH*sizeof(struct buffer_head *));
1715 bh_cachep = kmem_cache_create("buffer_head",
1716 sizeof(struct buffer_head),
1718 SLAB_HWCACHE_ALIGN, NULL, NULL);
1719 if(!bh_cachep)
1720 panic("Cannot create buffer head SLAB cache\n");
1722 * Allocate the reserved buffer heads.
1724 while (nr_buffer_heads < NR_RESERVED) {
1725 struct buffer_head * bh;
1727 bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
1728 if (!bh)
1729 break;
1730 put_unused_buffer_head(bh);
1731 nr_buffer_heads++;
1734 lru_list[BUF_CLEAN] = 0;
1735 grow_buffers(GFP_KERNEL, BLOCK_SIZE);
1739 /* ====================== bdflush support =================== */
1741 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1742 * response to dirty buffers. Once this process is activated, we write back
1743 * a limited number of buffers to the disks and then go back to sleep again.
1745 static struct wait_queue * bdflush_wait = NULL;
1746 static struct wait_queue * bdflush_done = NULL;
1747 struct task_struct *bdflush_tsk = 0;
1749 void wakeup_bdflush(int wait)
1751 if (current == bdflush_tsk)
1752 return;
1753 wake_up(&bdflush_wait);
1754 if (wait) {
1755 run_task_queue(&tq_disk);
1756 sleep_on(&bdflush_done);
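/*
 * With wait != 0 the caller first kicks the disk queue and then sleeps on
 * bdflush_done, i.e. it blocks until bdflush has finished one flushing
 * pass and does wake_up(&bdflush_done) below.
 */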
1762 * Here we attempt to write back old buffers. We also try to flush inodes
1763 * and supers as well, since this function is essentially "update", and
1764 * otherwise there would be no way of ensuring that these quantities ever
1765 * get written back. Ideally, we would have a timestamp on the inodes
1766 * and superblocks so that we could write back only the old ones as well
1769 asmlinkage int sync_old_buffers(void)
1771 int i;
1772 int ndirty, nwritten;
1773 int nlist;
1774 int ncount;
1775 struct buffer_head * bh, *next;
1777 sync_supers(0);
1778 sync_inodes(0);
1780 ncount = 0;
1781 #ifdef DEBUG
1782 for(nlist = 0; nlist < NR_LIST; nlist++)
1783 #else
1784 for(nlist = BUF_DIRTY; nlist <= BUF_DIRTY; nlist++)
1785 #endif
1787 ndirty = 0;
1788 nwritten = 0;
1789 repeat:
1791 bh = lru_list[nlist];
1792 if(bh)
1793 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1794 /* We may have stalled while waiting for I/O to complete. */
1795 if(bh->b_list != nlist) goto repeat;
1796 next = bh->b_next_free;
1797 if(!lru_list[nlist]) {
1798 printk("Dirty list empty %d\n", i);
1799 break;
1802 /* Clean buffer on dirty list? Refile it */
1803 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
1805 refile_buffer(bh);
1806 continue;
1809 if (buffer_locked(bh) || !buffer_dirty(bh))
1810 continue;
1811 ndirty++;
1812 if(bh->b_flushtime > jiffies) continue;
1813 nwritten++;
1814 next->b_count++;
1815 bh->b_count++;
1816 bh->b_flushtime = 0;
1817 #ifdef DEBUG
1818 if(nlist != BUF_DIRTY) ncount++;
1819 #endif
1820 ll_rw_block(WRITE, 1, &bh);
1821 bh->b_count--;
1822 next->b_count--;
1825 run_task_queue(&tq_disk);
1826 #ifdef DEBUG
1827 if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
1828 printk("Wrote %d/%d buffers\n", nwritten, ndirty);
1829 #endif
1830 run_task_queue(&tq_disk);
1831 return 0;
1835 /* This is the interface to bdflush. As we get more sophisticated, we can
1836 * pass tuning parameters to this "process", to adjust how it behaves.
1837 * We would want to verify each parameter, however, to make sure that it
1838 * is reasonable. */
1840 asmlinkage int sys_bdflush(int func, long data)
1842 int i, error = -EPERM;
1844 lock_kernel();
1845 if (!suser())
1846 goto out;
1848 if (func == 1) {
1849 error = sync_old_buffers();
1850 goto out;
1853 /* Basically func 2n+2 reads bdf_prm.data[n] and func 2n+3 writes it. */
1854 if (func >= 2) {
1855 i = (func-2) >> 1;
1856 error = -EINVAL;
1857 if (i < 0 || i >= N_PARAM)
1858 goto out;
1859 if((func & 1) == 0) {
1860 error = put_user(bdf_prm.data[i], (int*)data);
1861 goto out;
1863 if (data < bdflush_min[i] || data > bdflush_max[i])
1864 goto out;
1865 bdf_prm.data[i] = data;
1866 error = 0;
1867 goto out;
1870 /* Func 0 used to launch the actual bdflush and then never
1871 * return (unless explicitly killed). We return zero here to
1872 * remain semi-compatible with present update(8) programs.
1874 error = 0;
1875 out:
1876 unlock_kernel();
1877 return error;
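/*
 * Illustrative (hypothetical user-space) use of the interface above,
 * assuming a bdflush(func, data) syscall wrapper:
 *
 *	int nfract;
 *	bdflush(2, (long) &nfract);	reads bdf_prm.data[0] (nfract)
 *	bdflush(3, 60);			sets nfract to 60 (within min/max)
 *	bdflush(1, 0);			runs sync_old_buffers()
 */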
1880 /* This is the actual bdflush daemon itself. It used to be started from
1881 * the syscall above, but now we launch it ourselves internally with
1882 * kernel_thread(...) directly after the first thread in init/main.c */
1884 /* To prevent deadlocks for a loop device:
1885 * 1) Do non-blocking writes to loop (avoids deadlock with running
1886 * out of request blocks).
1887 * 2) But do a blocking write if the only dirty buffers are loop buffers
1888 * (otherwise we go into an infinite busy-loop).
1889 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
1890 * with running out of free buffers for loop's "real" device).
1892 int bdflush(void * unused)
1894 int i;
1895 int ndirty;
1896 int nlist;
1897 int ncount;
1898 struct buffer_head * bh, *next;
1899 int major;
1900 int wrta_cmd = WRITEA; /* non-blocking write for LOOP */
1903 * We have a bare-bones task_struct, and really should fill
1904 * in a few more things so "top" and /proc/2/{exe,root,cwd}
1905 * display semi-sane things. Not real crucial though...
1908 current->session = 1;
1909 current->pgrp = 1;
1910 sprintf(current->comm, "kflushd");
1911 bdflush_tsk = current;
1914 * As a kernel thread we want to tamper with system buffers
1915 * and other internals and thus be subject to the SMP locking
1916 * rules. (On a uniprocessor box this does nothing).
1918 lock_kernel();
1920 for (;;) {
1921 #ifdef DEBUG
1922 printk("bdflush() activated...");
1923 #endif
1925 CHECK_EMERGENCY_SYNC
1927 ncount = 0;
1928 #ifdef DEBUG
1929 for(nlist = 0; nlist < NR_LIST; nlist++)
1930 #else
1931 for(nlist = BUF_DIRTY; nlist <= BUF_DIRTY; nlist++)
1932 #endif
1934 ndirty = 0;
1935 refilled = 0;
1936 repeat:
1938 bh = lru_list[nlist];
1939 if(bh)
1940 for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
1941 bh = next) {
1942 /* We may have stalled while waiting for I/O to complete. */
1943 if(bh->b_list != nlist) goto repeat;
1944 next = bh->b_next_free;
1945 if(!lru_list[nlist]) {
1946 printk("Dirty list empty %d\n", i);
1947 break;
1950 /* Clean buffer on dirty list? Refile it */
1951 if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh))
1953 refile_buffer(bh);
1954 continue;
1957 if (buffer_locked(bh) || !buffer_dirty(bh))
1958 continue;
1959 major = MAJOR(bh->b_dev);
1960 /* Should we write back buffers that are shared or not??
1961 currently dirty buffers are not shared, so it does not matter */
1962 if (refilled && major == LOOP_MAJOR)
1963 continue;
1964 next->b_count++;
1965 bh->b_count++;
1966 ndirty++;
1967 bh->b_flushtime = 0;
1968 if (major == LOOP_MAJOR) {
1969 ll_rw_block(wrta_cmd,1, &bh);
1970 wrta_cmd = WRITEA;
1971 if (buffer_dirty(bh))
1972 --ndirty;
1974 else
1975 ll_rw_block(WRITE, 1, &bh);
1976 #ifdef DEBUG
1977 if(nlist != BUF_DIRTY) ncount++;
1978 #endif
1979 bh->b_count--;
1980 next->b_count--;
1983 #ifdef DEBUG
1984 if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
1985 printk("sleeping again.\n");
1986 #endif
1987 /* If we didn't write anything, but there are still
1988 * dirty buffers, then make the next write to a
1989 * loop device a blocking write.
1990 * This lets us block--which we _must_ do! */
1991 if (ndirty == 0 && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
1992 wrta_cmd = WRITE;
1993 continue;
1995 run_task_queue(&tq_disk);
1996 wake_up(&bdflush_done);
1998 /* If there are still a lot of dirty buffers around, skip the sleep
1999 and flush some more */
2000 if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
2001 spin_lock_irq(&current->sigmask_lock);
2002 flush_signals(current);
2003 spin_unlock_irq(&current->sigmask_lock);
2005 interruptible_sleep_on(&bdflush_wait);