4 * Copyright (C) 1991, 1992 Linus Torvalds
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
27 #include <linux/malloc.h>
28 #include <linux/locks.h>
29 #include <linux/errno.h>
30 #include <linux/swap.h>
31 #include <linux/swapctl.h>
32 #include <linux/smp_lock.h>
33 #include <linux/vmalloc.h>
34 #include <linux/blkdev.h>
35 #include <linux/sysrq.h>
36 #include <linux/file.h>
37 #include <linux/init.h>
38 #include <linux/quotaops.h>
40 #include <asm/uaccess.h>
42 #include <asm/bitops.h>
/*
 * Map (block size >> 9) to a small table index: 512->0, 1024->1,
 * 2048->2, 4096->3, 8192->4, 16384->5; -1 marks sizes that are not a
 * supported power-of-two block size.  Consumed via BUFSIZE_INDEX()
 * below to pick a free_list[] slot.
 * NOTE(review): this extraction is missing the final initializer row
 * (index 64, presumably 6 for the 32k buffers mentioned in the file
 * header -- confirm against the full source) and the closing "};".
 */
45 static char buffersize_index
[65] =
46 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
47 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
48 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
49 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
/* Convert a buffer size in bytes to its buffersize_index[] slot (size/512). */
52 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
/* Maximum number of 512-byte buffers that fit in a single page. */
53 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
/* Buffer heads held in reserve for async (paging/swap) I/O, to avoid
 * the no-buffer-head deadlock described at get_unused_buffer_head(). */
54 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
/*
 * Cap on the unused_list length: put_unused_buffer_head() frees heads
 * back to the SLAB cache beyond this point.
 * The expansion is parenthesized so the macro stays correct inside a
 * larger expression (e.g. `2*MAX_UNUSED_BUFFERS`); the unparenthesized
 * form `NR_RESERVED+20` would bind wrongly there (CERT PRE02-C).
 * All current uses (a `>=` comparison) are unaffected.
 */
#define MAX_UNUSED_BUFFERS (NR_RESERVED+20)	/* don't ever have more than this
						   number of unused buffer heads */
61 static unsigned long bh_hash_mask
= 0;
63 static int grow_buffers(int size
);
65 static struct buffer_head
** hash_table
;
66 static struct buffer_head
* lru_list
[NR_LIST
] = {NULL
, };
67 static struct buffer_head
* free_list
[NR_SIZES
] = {NULL
, };
69 static kmem_cache_t
*bh_cachep
;
71 static struct buffer_head
* unused_list
= NULL
;
72 static struct buffer_head
* reuse_list
= NULL
;
73 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait
);
75 static int nr_buffers
= 0;
76 static int nr_buffers_type
[NR_LIST
] = {0,};
77 static int nr_buffer_heads
= 0;
78 static int nr_unused_buffer_heads
= 0;
79 static int nr_hashed_buffers
= 0;
81 /* This is used by some architectures to estimate available memory. */
84 /* Here is the parameter block for the bdflush process. If you add or
85 * remove any of the parameters, make sure to update kernel/sysctl.c.
90 /* The dummy values in this structure are left in there for compatibility
91 * with old programs that play with the /proc entries.
95 int nfract
; /* Percentage of buffer cache dirty to
97 int ndirty
; /* Maximum number of dirty blocks to write out per
99 int nrefill
; /* Number of clean buffers to try to obtain
100 each time we call refill */
101 int nref_dirt
; /* Dirty buffer threshold for activating bdflush
102 when trying to refill buffers. */
103 int interval
; /* Interval (seconds) between spontaneous
105 int age_buffer
; /* Time for normal buffer to age before
107 int age_super
; /* Time for superblock to age before we
109 int dummy2
; /* unused */
110 int dummy3
; /* unused */
112 unsigned int data
[N_PARAM
];
113 } bdf_prm
= {{40, 500, 64, 256, 5, 30*HZ
, 5*HZ
, 1884, 2}};
115 /* These are the min and max parameter values that we will allow to be assigned */
116 int bdflush_min
[N_PARAM
] = { 0, 10, 5, 25, 1, 1*HZ
, 1*HZ
, 1, 1};
117 int bdflush_max
[N_PARAM
] = {100,5000, 2000, 2000,100, 600*HZ
, 600*HZ
, 2047, 5};
119 void wakeup_bdflush(int);
122 * Rewrote the wait-routines to use the "new" wait-queue functionality,
123 * and getting rid of the cli-sti pairs. The wait-queue routines still
124 * need cli-sti, but now it's just a couple of 386 instructions or so.
126 * Note that the real wait_on_buffer() is an inline function that checks
127 * if 'b_wait' is set before calling this, so that the queues aren't set
/*
 * Sleep until buffer 'bh' is unlocked.  Per the comment above, the
 * real wait_on_buffer() is an inline wrapper that checks b_wait first,
 * so this slow path only runs when a wait is genuinely needed.
 * NOTE(review): this extraction is missing several original lines --
 * the opening '{', the body of the locked-check (presumably a
 * schedule() loop; confirm against the full source) and the closing
 * '}'.  The surviving fragments are annotated byte-for-byte below.
 */
130 void __wait_on_buffer(struct buffer_head
* bh
)
132 struct task_struct
*tsk
= current
;
/* Wait-queue entry naming the current task; linked onto bh->b_wait. */
133 DECLARE_WAITQUEUE(wait
, tsk
);
136 add_wait_queue(&bh
->b_wait
, &wait
);
/* Mark ourselves asleep BEFORE kicking the disk queue, so the unlock
 * wakeup cannot slip in between and be missed. */
138 tsk
->state
= TASK_UNINTERRUPTIBLE
;
139 run_task_queue(&tq_disk
);
140 if (buffer_locked(bh
)) {
/* (original lines 141-143 elided by the extraction) */
/* Awake again: restore runnable state and unlink from the queue. */
144 tsk
->state
= TASK_RUNNING
;
145 remove_wait_queue(&bh
->b_wait
, &wait
);
149 /* Call sync_buffers with wait!=0 to ensure that the call does not
150 * return until all buffer writes have completed. Sync() may return
151 * before the writes have finished; fsync() may not.
154 /* Godamity-damn. Some buffers (bitmaps for filesystems)
155 * spontaneously dirty themselves without ever brelse being called.
156 * We will ultimately want to put these in a separate list, but for
157 * now we search all of the lists for dirty buffers.
159 static int sync_buffers(kdev_t dev
, int wait
)
161 int i
, retry
, pass
= 0, err
= 0;
162 struct buffer_head
* bh
, *next
;
164 /* One pass for no-wait, three for wait:
165 * 0) write out all dirty, unlocked buffers;
166 * 1) write out all dirty buffers, waiting if locked;
167 * 2) wait for completion by waiting for all buffers to unlock.
172 /* We search all lists as a failsafe mechanism, not because we expect
173 * there to be dirty buffers on any of the other lists.
175 bh
= lru_list
[BUF_DIRTY
];
178 for (i
= nr_buffers_type
[BUF_DIRTY
]*2 ; i
-- > 0 ; bh
= next
) {
179 if (bh
->b_list
!= BUF_DIRTY
)
181 next
= bh
->b_next_free
;
182 if (!lru_list
[BUF_DIRTY
])
184 if (dev
&& bh
->b_dev
!= dev
)
186 if (buffer_locked(bh
)) {
187 /* Buffer is locked; skip it unless wait is
188 * requested AND pass > 0.
190 if (!wait
|| !pass
) {
198 /* If an unlocked buffer is not uptodate, there has
199 * been an IO error. Skip it.
201 if (wait
&& buffer_req(bh
) && !buffer_locked(bh
) &&
202 !buffer_dirty(bh
) && !buffer_uptodate(bh
)) {
207 /* Don't write clean buffers. Don't write ANY buffers
210 if (!buffer_dirty(bh
) || pass
>= 2)
213 /* Don't bother about locked buffers.
215 * XXX We checked if it was locked above and there is no
216 * XXX way we could have slept in between. -DaveM
218 if (buffer_locked(bh
))
223 ll_rw_block(WRITE
, 1, &bh
);
230 bh
= lru_list
[BUF_LOCKED
];
233 for (i
= nr_buffers_type
[BUF_LOCKED
]*2 ; i
-- > 0 ; bh
= next
) {
234 if (bh
->b_list
!= BUF_LOCKED
)
236 next
= bh
->b_next_free
;
237 if (!lru_list
[BUF_LOCKED
])
239 if (dev
&& bh
->b_dev
!= dev
)
241 if (buffer_locked(bh
)) {
242 /* Buffer is locked; skip it unless wait is
243 * requested AND pass > 0.
245 if (!wait
|| !pass
) {
254 /* If we are waiting for the sync to succeed, and if any dirty
255 * blocks were written, then repeat; on the second pass, only
256 * wait for buffers being written (do not pass to write any
257 * more buffers on the second pass).
259 } while (wait
&& retry
&& ++pass
<=2);
263 void sync_dev(kdev_t dev
)
265 sync_buffers(dev
, 0);
268 sync_buffers(dev
, 0);
271 * FIXME(eric) we need to sync the physical devices here.
272 * This is because some (scsi) controllers have huge amounts of
273 * cache onboard (hundreds of Mb), and we need to instruct
274 * them to commit all of the dirty memory to disk, and we should
275 * not return until this has happened.
277 * This would need to get implemented by going through the assorted
278 * layers so that each block major number can be synced, and this
279 * would call down into the upper and mid-layer scsi.
283 int fsync_dev(kdev_t dev
)
285 sync_buffers(dev
, 0);
289 return sync_buffers(dev
, 1);
292 asmlinkage
int sys_sync(void)
301 * filp may be NULL if called via the msync of a vma.
304 int file_fsync(struct file
*filp
, struct dentry
*dentry
)
306 struct inode
* inode
= dentry
->d_inode
;
307 struct super_block
* sb
;
310 /* sync the inode to buffers */
311 write_inode_now(inode
);
313 /* sync the superblock to buffers */
316 if (sb
->s_op
&& sb
->s_op
->write_super
)
317 sb
->s_op
->write_super(sb
);
319 /* .. finally sync the buffers to disk */
321 return sync_buffers(dev
, 1);
324 asmlinkage
int sys_fsync(unsigned int fd
)
327 struct dentry
* dentry
;
328 struct inode
* inode
;
337 dentry
= file
->f_dentry
;
341 inode
= dentry
->d_inode
;
346 if (!file
->f_op
|| !file
->f_op
->fsync
)
349 /* We need to protect against concurrent writers.. */
351 err
= file
->f_op
->fsync(file
, dentry
);
361 asmlinkage
int sys_fdatasync(unsigned int fd
)
364 struct dentry
* dentry
;
365 struct inode
* inode
;
374 dentry
= file
->f_dentry
;
378 inode
= dentry
->d_inode
;
383 if (!file
->f_op
|| !file
->f_op
->fsync
)
386 /* this needs further work, at the moment it is identical to fsync() */
388 err
= file
->f_op
->fsync(file
, dentry
);
398 void invalidate_buffers(kdev_t dev
)
402 struct buffer_head
* bh
;
404 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
405 bh
= lru_list
[nlist
];
406 for (i
= nr_buffers_type
[nlist
]*2 ; --i
> 0 ; bh
= bh
->b_next_free
) {
407 if (bh
->b_dev
!= dev
)
410 if (bh
->b_dev
!= dev
)
415 clear_bit(BH_Protected
, &bh
->b_state
);
416 clear_bit(BH_Uptodate
, &bh
->b_state
);
417 clear_bit(BH_Dirty
, &bh
->b_state
);
418 clear_bit(BH_Req
, &bh
->b_state
);
/*
 * Hash a (dev, block) pair into a slot of hash_table[]; bh_hash_mask
 * is (table size - 1), so the table size must be a power of two (see
 * buffer_init()).  The 'block' parameter is parenthesized in the
 * expansion so an expression argument cannot bind wrongly against the
 * '^' operator (CERT PRE01-C); all current callers pass simple
 * lvalues, so behavior is unchanged.
 */
#define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^(block))) & bh_hash_mask)
#define hash(dev,block) hash_table[_hashfn(dev,block)]
426 static inline void remove_from_hash_queue(struct buffer_head
* bh
)
428 struct buffer_head
**pprev
= bh
->b_pprev
;
430 struct buffer_head
* next
= bh
->b_next
;
432 next
->b_pprev
= pprev
;
441 static inline void remove_from_lru_list(struct buffer_head
* bh
)
443 if (!(bh
->b_prev_free
) || !(bh
->b_next_free
))
444 panic("VFS: LRU block list corrupted");
445 if (bh
->b_dev
== B_FREE
)
446 panic("LRU list corrupted");
447 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
448 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
450 if (lru_list
[bh
->b_list
] == bh
)
451 lru_list
[bh
->b_list
] = bh
->b_next_free
;
452 if (lru_list
[bh
->b_list
] == bh
)
453 lru_list
[bh
->b_list
] = NULL
;
454 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
457 static inline void remove_from_free_list(struct buffer_head
* bh
)
459 int isize
= BUFSIZE_INDEX(bh
->b_size
);
460 if (!(bh
->b_prev_free
) || !(bh
->b_next_free
))
461 panic("VFS: Free block list corrupted");
462 if(bh
->b_dev
!= B_FREE
)
463 panic("Free list corrupted");
464 if(!free_list
[isize
])
465 panic("Free list empty");
466 if(bh
->b_next_free
== bh
)
467 free_list
[isize
] = NULL
;
469 bh
->b_prev_free
->b_next_free
= bh
->b_next_free
;
470 bh
->b_next_free
->b_prev_free
= bh
->b_prev_free
;
471 if (free_list
[isize
] == bh
)
472 free_list
[isize
] = bh
->b_next_free
;
474 bh
->b_next_free
= bh
->b_prev_free
= NULL
;
477 static void remove_from_queues(struct buffer_head
* bh
)
479 if(bh
->b_dev
== B_FREE
) {
480 remove_from_free_list(bh
); /* Free list entries should not be
484 nr_buffers_type
[bh
->b_list
]--;
485 remove_from_hash_queue(bh
);
486 remove_from_lru_list(bh
);
489 static inline void put_last_free(struct buffer_head
* bh
)
492 struct buffer_head
**bhp
= &free_list
[BUFSIZE_INDEX(bh
->b_size
)];
494 bh
->b_dev
= B_FREE
; /* So it is obvious we are on the free list. */
496 /* Add to back of free list. */
499 bh
->b_prev_free
= bh
;
502 bh
->b_next_free
= *bhp
;
503 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
504 (*bhp
)->b_prev_free
->b_next_free
= bh
;
505 (*bhp
)->b_prev_free
= bh
;
509 static void insert_into_queues(struct buffer_head
* bh
)
511 /* put at end of free list */
512 if(bh
->b_dev
== B_FREE
) {
515 struct buffer_head
**bhp
= &lru_list
[bh
->b_list
];
519 bh
->b_prev_free
= bh
;
523 panic("VFS: buffer LRU pointers corrupted");
525 bh
->b_next_free
= *bhp
;
526 bh
->b_prev_free
= (*bhp
)->b_prev_free
;
527 (*bhp
)->b_prev_free
->b_next_free
= bh
;
528 (*bhp
)->b_prev_free
= bh
;
530 nr_buffers_type
[bh
->b_list
]++;
532 /* Put the buffer in new hash-queue if it has a device. */
536 struct buffer_head
**bhp
= &hash(bh
->b_dev
, bh
->b_blocknr
);
537 struct buffer_head
*next
= *bhp
;
541 next
->b_pprev
= &bh
->b_next
;
550 struct buffer_head
* find_buffer(kdev_t dev
, int block
, int size
)
552 struct buffer_head
* next
;
554 next
= hash(dev
,block
);
556 struct buffer_head
*tmp
= next
;
560 if (tmp
->b_blocknr
!= block
|| tmp
->b_size
!= size
|| tmp
->b_dev
!= dev
)
569 * Why like this, I hear you say... The reason is race-conditions.
570 * As we don't lock buffers (unless we are reading them, that is),
571 * something might happen to it while we sleep (ie a read-error
572 * will force it bad). This shouldn't really happen currently, but
575 struct buffer_head
* get_hash_table(kdev_t dev
, int block
, int size
)
577 struct buffer_head
* bh
;
578 bh
= find_buffer(dev
,block
,size
);
584 unsigned int get_hardblocksize(kdev_t dev
)
587 * Get the hard sector size for the given device. If we don't know
588 * what it is, return 0.
590 if (hardsect_size
[MAJOR(dev
)] != NULL
) {
591 int blksize
= hardsect_size
[MAJOR(dev
)][MINOR(dev
)];
597 * We don't know what the hardware sector size for this device is.
598 * Return 0 indicating that we don't know.
603 void set_blocksize(kdev_t dev
, int size
)
605 extern int *blksize_size
[];
607 struct buffer_head
* bh
, *bhnext
;
609 if (!blksize_size
[MAJOR(dev
)])
612 /* Size must be a power of two, and between 512 and PAGE_SIZE */
613 if (size
> PAGE_SIZE
|| size
< 512 || (size
& (size
-1)))
614 panic("Invalid blocksize passed to set_blocksize");
616 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == 0 && size
== BLOCK_SIZE
) {
617 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
620 if (blksize_size
[MAJOR(dev
)][MINOR(dev
)] == size
)
622 sync_buffers(dev
, 2);
623 blksize_size
[MAJOR(dev
)][MINOR(dev
)] = size
;
625 /* We need to be quite careful how we do this - we are moving entries
626 * around on the free list, and we can get in a loop if we are not careful.
628 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
629 bh
= lru_list
[nlist
];
630 for (i
= nr_buffers_type
[nlist
]*2 ; --i
> 0 ; bh
= bhnext
) {
634 bhnext
= bh
->b_next_free
;
635 if (bh
->b_dev
!= dev
)
637 if (bh
->b_size
== size
)
642 if (bh
->b_dev
== dev
&& bh
->b_size
!= size
) {
643 clear_bit(BH_Dirty
, &bh
->b_state
);
644 clear_bit(BH_Uptodate
, &bh
->b_state
);
645 clear_bit(BH_Req
, &bh
->b_state
);
648 remove_from_hash_queue(bh
);
654 * We used to try various strange things. Let's not.
656 static void refill_freelist(int size
)
658 if (!grow_buffers(size
)) {
660 current
->policy
|= SCHED_YIELD
;
665 void init_buffer(struct buffer_head
*bh
, kdev_t dev
, int block
,
666 bh_end_io_t
*handler
, void *dev_id
)
669 bh
->b_list
= BUF_CLEAN
;
672 bh
->b_blocknr
= block
;
673 bh
->b_end_io
= handler
;
674 bh
->b_dev_id
= dev_id
;
677 static void end_buffer_io_sync(struct buffer_head
*bh
, int uptodate
)
679 mark_buffer_uptodate(bh
, uptodate
);
684 * Ok, this is getblk, and it isn't very clear, again to hinder
685 * race-conditions. Most of the code is seldom used, (ie repeating),
686 * so it should be much more efficient than it looks.
688 * The algorithm is changed: hopefully better, and an elusive bug removed.
690 * 14.02.92: changed it to sync dirty buffers a bit: better performance
691 * when the filesystem starts to get full of dirty blocks (I hope).
693 struct buffer_head
* getblk(kdev_t dev
, int block
, int size
)
695 struct buffer_head
* bh
;
699 bh
= get_hash_table(dev
, block
, size
);
701 if (!buffer_dirty(bh
)) {
707 isize
= BUFSIZE_INDEX(size
);
709 bh
= free_list
[isize
];
712 remove_from_free_list(bh
);
714 /* OK, FINALLY we know that this buffer is the only one of its kind,
715 * and that it's unused (b_count=0), unlocked, and clean.
717 init_buffer(bh
, dev
, block
, end_buffer_io_sync
, NULL
);
719 insert_into_queues(bh
);
723 * If we block while refilling the free list, somebody may
724 * create the buffer first ... search the hashes again.
727 refill_freelist(size
);
728 if (!find_buffer(dev
,block
,size
))
733 void set_writetime(struct buffer_head
* buf
, int flag
)
737 if (buffer_dirty(buf
)) {
738 /* Move buffer to dirty list if jiffies is clear. */
739 newtime
= jiffies
+ (flag
? bdf_prm
.b_un
.age_super
:
740 bdf_prm
.b_un
.age_buffer
);
741 if(!buf
->b_flushtime
|| buf
->b_flushtime
> newtime
)
742 buf
->b_flushtime
= newtime
;
744 buf
->b_flushtime
= 0;
750 * Put a buffer into the appropriate list, without side-effects.
752 static inline void file_buffer(struct buffer_head
*bh
, int list
)
754 remove_from_queues(bh
);
756 insert_into_queues(bh
);
760 * A buffer may need to be moved from one buffer list to another
761 * (e.g. in case it is not shared any more). Handle this.
763 void refile_buffer(struct buffer_head
* buf
)
767 if(buf
->b_dev
== B_FREE
) {
768 printk("Attempt to refile free buffer\n");
771 if (buffer_dirty(buf
))
773 else if (buffer_locked(buf
))
774 dispose
= BUF_LOCKED
;
777 if(dispose
!= buf
->b_list
) {
778 file_buffer(buf
, dispose
);
779 if(dispose
== BUF_DIRTY
) {
780 int too_many
= (nr_buffers
* bdf_prm
.b_un
.nfract
/100);
782 /* This buffer is dirty, maybe we need to start flushing.
783 * If too high a percentage of the buffers are dirty...
785 if (nr_buffers_type
[BUF_DIRTY
] > too_many
)
788 /* If this is a loop device, and
789 * more than half of the buffers are dirty...
790 * (Prevents no-free-buffers deadlock with loop device.)
792 if (MAJOR(buf
->b_dev
) == LOOP_MAJOR
&&
793 nr_buffers_type
[BUF_DIRTY
]*2>nr_buffers
)
800 * Release a buffer head
802 void __brelse(struct buffer_head
* buf
)
804 /* If dirty, mark the time this buffer should be written back. */
805 set_writetime(buf
, 0);
813 printk("VFS: brelse: Trying to free free buffer\n");
817 * bforget() is like brelse(), except it puts the buffer on the
818 * free list if it can.. We can NOT free the buffer if:
819 * - there are other users of it
820 * - it is locked and thus can have active IO
822 void __bforget(struct buffer_head
* buf
)
824 if (buf
->b_count
!= 1 || buffer_locked(buf
)) {
830 remove_from_queues(buf
);
835 * bread() reads a specified block and returns the buffer that contains
836 * it. It returns NULL if the block was unreadable.
838 struct buffer_head
* bread(kdev_t dev
, int block
, int size
)
840 struct buffer_head
* bh
;
842 bh
= getblk(dev
, block
, size
);
843 if (buffer_uptodate(bh
))
845 ll_rw_block(READ
, 1, &bh
);
847 if (buffer_uptodate(bh
))
854 * Ok, breada can be used as bread, but additionally to mark other
855 * blocks for reading as well. End the argument list with a negative
861 struct buffer_head
* breada(kdev_t dev
, int block
, int bufsize
,
862 unsigned int pos
, unsigned int filesize
)
864 struct buffer_head
* bhlist
[NBUF
];
866 struct buffer_head
* bh
;
876 bh
= getblk(dev
, block
, bufsize
);
877 index
= BUFSIZE_INDEX(bh
->b_size
);
879 if (buffer_uptodate(bh
))
881 else ll_rw_block(READ
, 1, &bh
);
883 blocks
= (filesize
- pos
) >> (9+index
);
885 if (blocks
< (read_ahead
[MAJOR(dev
)] >> index
))
886 blocks
= read_ahead
[MAJOR(dev
)] >> index
;
890 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
895 for(i
=1; i
<blocks
; i
++) {
896 bh
= getblk(dev
,block
+i
,bufsize
);
897 if (buffer_uptodate(bh
)) {
901 else bhlist
[j
++] = bh
;
904 /* Request the read for these buffers, and then release them. */
906 ll_rw_block(READA
, (j
-1), bhlist
+1);
910 /* Wait for this buffer, and then continue on. */
913 if (buffer_uptodate(bh
))
920 * Note: the caller should wake up the buffer_wait list if needed.
922 static void put_unused_buffer_head(struct buffer_head
* bh
)
924 if (nr_unused_buffer_heads
>= MAX_UNUSED_BUFFERS
) {
926 kmem_cache_free(bh_cachep
, bh
);
930 memset(bh
,0,sizeof(*bh
));
931 init_waitqueue_head(&bh
->b_wait
);
932 nr_unused_buffer_heads
++;
933 bh
->b_next_free
= unused_list
;
938 * We can't put completed temporary IO buffer_heads directly onto the
939 * unused_list when they become unlocked, since the device driver
940 * end_request routines still expect access to the buffer_head's
941 * fields after the final unlock. So, the device driver puts them on
942 * the reuse_list instead once IO completes, and we recover these to
943 * the unused_list here.
945 * Note that we don't do a wakeup here, but return a flag indicating
946 * whether we got any buffer heads. A task ready to sleep can check
947 * the returned value, and any tasks already sleeping will have been
948 * awakened when the buffer heads were added to the reuse list.
950 static inline int recover_reusable_buffer_heads(void)
952 struct buffer_head
*head
= xchg(&reuse_list
, NULL
);
957 struct buffer_head
*bh
= head
;
958 head
= head
->b_next_free
;
959 put_unused_buffer_head(bh
);
967 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
968 * no-buffer-head deadlock. Return NULL on failure; waiting for
969 * buffer heads is now handled in create_buffers().
971 static struct buffer_head
* get_unused_buffer_head(int async
)
973 struct buffer_head
* bh
;
975 recover_reusable_buffer_heads();
976 if (nr_unused_buffer_heads
> NR_RESERVED
) {
978 unused_list
= bh
->b_next_free
;
979 nr_unused_buffer_heads
--;
983 /* This is critical. We can't swap out pages to get
984 * more buffer heads, because the swap-out may need
985 * more buffer-heads itself. Thus SLAB_BUFFER.
987 if((bh
= kmem_cache_alloc(bh_cachep
, SLAB_BUFFER
)) != NULL
) {
988 memset(bh
, 0, sizeof(*bh
));
989 init_waitqueue_head(&bh
->b_wait
);
995 * If we need an async buffer, use the reserved buffer heads.
997 if (async
&& unused_list
) {
999 unused_list
= bh
->b_next_free
;
1000 nr_unused_buffer_heads
--;
1006 * (Pending further analysis ...)
1007 * Ordinary (non-async) requests can use a different memory priority
1008 * to free up pages. Any swapping thus generated will use async
1012 (bh
= kmem_cache_alloc(bh_cachep
, SLAB_KERNEL
)) != NULL
) {
1013 memset(bh
, 0, sizeof(*bh
));
1014 init_waitqueue_head(&bh
->b_wait
);
1024 * Create the appropriate buffers when given a page for data area and
1025 * the size of each buffer.. Use the bh->b_this_page linked list to
1026 * follow the buffers created. Return NULL if unable to create more
1028 * The async flag is used to differentiate async IO (paging, swapping)
1029 * from ordinary buffer allocations, and only async requests are allowed
1030 * to sleep waiting for buffer heads.
1032 static struct buffer_head
* create_buffers(unsigned long page
,
1033 unsigned long size
, int async
)
1035 DECLARE_WAITQUEUE(wait
, current
);
1036 struct buffer_head
*bh
, *head
;
1042 while ((offset
-= size
) >= 0) {
1043 bh
= get_unused_buffer_head(async
);
1047 bh
->b_dev
= B_FREE
; /* Flag as unused */
1048 bh
->b_this_page
= head
;
1052 bh
->b_next_free
= NULL
;
1056 bh
->b_data
= (char *) (page
+offset
);
1061 * In case anything failed, we just free everything we got.
1067 head
= head
->b_this_page
;
1068 put_unused_buffer_head(bh
);
1071 /* Wake up any waiters ... */
1072 wake_up(&buffer_wait
);
1076 * Return failure for non-async IO requests. Async IO requests
1077 * are not allowed to fail, so we have to wait until buffer heads
1078 * become available. But we don't want tasks sleeping with
1079 * partially complete buffers, so all were released above.
1084 /* We're _really_ low on memory. Now we just
1085 * wait for old buffer heads to become free due to
1086 * finishing IO. Since this is an async request and
1087 * the reserve list is empty, we're sure there are
1088 * async buffer heads in use.
1090 run_task_queue(&tq_disk
);
1093 * Set our state for sleeping, then check again for buffer heads.
1094 * This ensures we won't miss a wake_up from an interrupt.
1096 add_wait_queue(&buffer_wait
, &wait
);
1097 current
->state
= TASK_UNINTERRUPTIBLE
;
1098 if (!recover_reusable_buffer_heads())
1100 remove_wait_queue(&buffer_wait
, &wait
);
1101 current
->state
= TASK_RUNNING
;
1105 /* Run the hooks that have to be done when a page I/O has completed. */
1106 static inline void after_unlock_page (struct page
* page
)
1108 if (test_and_clear_bit(PG_decr_after
, &page
->flags
)) {
1109 atomic_dec(&nr_async_pages
);
1111 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1112 (char *) page_address(page
),
1113 atomic_read(&nr_async_pages
));
1116 if (test_and_clear_bit(PG_swap_unlock_after
, &page
->flags
))
1117 swap_after_unlock_page(page
->offset
);
1118 if (test_and_clear_bit(PG_free_after
, &page
->flags
))
1123 * Free all temporary buffers belonging to a page.
1124 * This needs to be called with interrupts disabled.
1126 static inline void free_async_buffers (struct buffer_head
* bh
)
1128 struct buffer_head
*tmp
, *tail
;
1131 * Link all the buffers into the b_next_free list,
1132 * so we only have to do one xchg() operation ...
1135 while ((tmp
= tail
->b_this_page
) != bh
) {
1136 tail
->b_next_free
= tmp
;
1140 /* Update the reuse list */
1141 tail
->b_next_free
= xchg(&reuse_list
, NULL
);
1144 /* Wake up any waiters ... */
1145 wake_up(&buffer_wait
);
1148 static void end_buffer_io_async(struct buffer_head
* bh
, int uptodate
)
1150 unsigned long flags
;
1151 struct buffer_head
*tmp
;
1154 mark_buffer_uptodate(bh
, uptodate
);
1157 /* This is a temporary buffer used for page I/O. */
1158 page
= mem_map
+ MAP_NR(bh
->b_data
);
1159 if (!PageLocked(page
))
1161 if (bh
->b_count
!= 1)
1164 if (!test_bit(BH_Uptodate
, &bh
->b_state
))
1165 set_bit(PG_error
, &page
->flags
);
1168 * Be _very_ careful from here on. Bad things can happen if
1169 * two buffer heads end IO at almost the same time and both
1170 * decide that the page is now completely done.
1172 * Async buffer_heads are here only as labels for IO, and get
1173 * thrown away once the IO for this page is complete. IO is
1174 * deemed complete once all buffers have been visited
1175 * (b_count==0) and are now unlocked. We must make sure that
1176 * only the _last_ buffer that decrements its count is the one
1177 * that free's the page..
1186 tmp
= tmp
->b_this_page
;
1187 } while (tmp
!= bh
);
1189 /* OK, the async IO on this page is complete. */
1190 free_async_buffers(bh
);
1191 restore_flags(flags
);
1192 clear_bit(PG_locked
, &page
->flags
);
1193 wake_up(&page
->wait
);
1194 after_unlock_page(page
);
1198 restore_flags(flags
);
1202 printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1206 printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1211 * Start I/O on a page.
1212 * This function expects the page to be locked and may return before I/O is complete.
1213 * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1215 int brw_page(int rw
, struct page
*page
, kdev_t dev
, int b
[], int size
, int bmap
)
1217 struct buffer_head
*bh
, *prev
, *next
, *arr
[MAX_BUF_PER_PAGE
];
1220 if (!PageLocked(page
))
1221 panic("brw_page: page not locked for I/O");
1222 clear_bit(PG_uptodate
, &page
->flags
);
1223 clear_bit(PG_error
, &page
->flags
);
1225 * Allocate async buffer heads pointing to this page, just for I/O.
1226 * They do _not_ show up in the buffer hash table!
1227 * They are _not_ registered in page->buffers either!
1229 bh
= create_buffers(page_address(page
), size
, 1);
1231 /* WSH: exit here leaves page->count incremented */
1232 clear_bit(PG_locked
, &page
->flags
);
1233 wake_up(&page
->wait
);
1239 struct buffer_head
* tmp
;
1242 init_buffer(next
, dev
, block
, end_buffer_io_async
, NULL
);
1243 set_bit(BH_Uptodate
, &next
->b_state
);
1246 * When we use bmap, we define block zero to represent
1247 * a hole. ll_rw_page, however, may legitimately
1248 * access block zero, and we need to distinguish the
1251 if (bmap
&& !block
) {
1252 memset(next
->b_data
, 0, size
);
1256 tmp
= get_hash_table(dev
, block
, size
);
1258 if (!buffer_uptodate(tmp
)) {
1260 ll_rw_block(READ
, 1, &tmp
);
1261 wait_on_buffer(tmp
);
1264 memcpy(next
->b_data
, tmp
->b_data
, size
);
1266 memcpy(tmp
->b_data
, next
->b_data
, size
);
1267 mark_buffer_dirty(tmp
, 0);
1274 clear_bit(BH_Uptodate
, &next
->b_state
);
1276 set_bit(BH_Dirty
, &next
->b_state
);
1278 } while (prev
= next
, (next
= next
->b_this_page
) != NULL
);
1279 prev
->b_this_page
= bh
;
1282 ll_rw_block(rw
, nr
, arr
);
1283 /* The rest of the work is done in mark_buffer_uptodate()
1284 * and unlock_buffer(). */
1286 unsigned long flags
;
1287 clear_bit(PG_locked
, &page
->flags
);
1288 set_bit(PG_uptodate
, &page
->flags
);
1289 wake_up(&page
->wait
);
1292 free_async_buffers(bh
);
1293 restore_flags(flags
);
1294 after_unlock_page(page
);
1301 * This is called by end_request() when I/O has completed.
1303 void mark_buffer_uptodate(struct buffer_head
* bh
, int on
)
1306 struct buffer_head
*tmp
= bh
;
1307 set_bit(BH_Uptodate
, &bh
->b_state
);
1308 /* If a page has buffers and all these buffers are uptodate,
1309 * then the page is uptodate. */
1311 if (!test_bit(BH_Uptodate
, &tmp
->b_state
))
1313 tmp
=tmp
->b_this_page
;
1314 } while (tmp
&& tmp
!= bh
);
1315 set_bit(PG_uptodate
, &mem_map
[MAP_NR(bh
->b_data
)].flags
);
1318 clear_bit(BH_Uptodate
, &bh
->b_state
);
1322 * Generic "readpage" function for block devices that have the normal
1323 * bmap functionality. This is most of the block device filesystems.
1324 * Reads the page asynchronously --- the unlock_buffer() and
1325 * mark_buffer_uptodate() functions propagate buffer state into the
1326 * page struct once IO has completed.
1328 int generic_readpage(struct file
* file
, struct page
* page
)
1330 struct dentry
*dentry
= file
->f_dentry
;
1331 struct inode
*inode
= dentry
->d_inode
;
1332 unsigned long block
;
1333 int *p
, nr
[PAGE_SIZE
/512];
1336 atomic_inc(&page
->count
);
1337 set_bit(PG_locked
, &page
->flags
);
1338 set_bit(PG_free_after
, &page
->flags
);
1340 i
= PAGE_SIZE
>> inode
->i_sb
->s_blocksize_bits
;
1341 block
= page
->offset
>> inode
->i_sb
->s_blocksize_bits
;
1344 *p
= inode
->i_op
->bmap(inode
, block
);
1351 brw_page(READ
, page
, inode
->i_dev
, nr
, inode
->i_sb
->s_blocksize
, 1);
1356 * Try to increase the number of buffers available: the size argument
1357 * is used to determine what kind of buffers we want.
1359 static int grow_buffers(int size
)
1362 struct buffer_head
*bh
, *tmp
;
1363 struct buffer_head
* insert_point
;
1366 if ((size
& 511) || (size
> PAGE_SIZE
)) {
1367 printk("VFS: grow_buffers: size = %d\n",size
);
1371 if (!(page
= __get_free_page(GFP_BUFFER
)))
1373 bh
= create_buffers(page
, size
, 0);
1379 isize
= BUFSIZE_INDEX(size
);
1380 insert_point
= free_list
[isize
];
1385 tmp
->b_next_free
= insert_point
->b_next_free
;
1386 tmp
->b_prev_free
= insert_point
;
1387 insert_point
->b_next_free
->b_prev_free
= tmp
;
1388 insert_point
->b_next_free
= tmp
;
1390 tmp
->b_prev_free
= tmp
;
1391 tmp
->b_next_free
= tmp
;
1395 if (tmp
->b_this_page
)
1396 tmp
= tmp
->b_this_page
;
1400 tmp
->b_this_page
= bh
;
1401 free_list
[isize
] = bh
;
1402 mem_map
[MAP_NR(page
)].buffers
= bh
;
1403 buffermem
+= PAGE_SIZE
;
1408 * Can the buffer be thrown out?
1410 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1411 #define buffer_busy(bh) ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
1414 * try_to_free_buffers() checks if all the buffers on this particular page
1415 * are unused, and free's the page if so.
1417 * Wake up bdflush() if this fails - if we're running low on memory due
1418 * to dirty buffers, we need to flush them out as quickly as possible.
1420 int try_to_free_buffers(struct page
* page_map
)
1422 struct buffer_head
* tmp
, * bh
= page_map
->buffers
;
1426 struct buffer_head
* p
= tmp
;
1428 tmp
= tmp
->b_this_page
;
1429 if (!buffer_busy(p
))
1434 } while (tmp
!= bh
);
1438 struct buffer_head
* p
= tmp
;
1439 tmp
= tmp
->b_this_page
;
1441 remove_from_queues(p
);
1442 put_unused_buffer_head(p
);
1443 } while (tmp
!= bh
);
1445 /* Wake up anyone waiting for buffer heads */
1446 wake_up(&buffer_wait
);
1448 /* And free the page */
1449 buffermem
-= PAGE_SIZE
;
1450 page_map
->buffers
= NULL
;
1451 __free_page(page_map
);
1455 /* ================== Debugging =================== */
/* Dump buffer-cache statistics to the console: total buffer memory, buffer
 * head / block / hashed counts, then one summary line per LRU list
 * (CLEAN/LOCKED/DIRTY) tallying used, locked, protected and dirty buffers.
 * NOTE(review): the inner do-loop opener, the per-counter increments and
 * some declarations (e.g. nlist, protected) are missing from this paste. */
1457 void show_buffers(void)
1459 struct buffer_head
* bh
;
1460 int found
= 0, locked
= 0, dirty
= 0, used
= 0, lastused
= 0;
1463 static char *buf_types
[NR_LIST
] = {"CLEAN","LOCKED","DIRTY"};
1465 printk("Buffer memory: %6dkB\n",buffermem
>>10);
1466 printk("Buffer heads: %6d\n",nr_buffer_heads
);
1467 printk("Buffer blocks: %6d\n",nr_buffers
);
1468 printk("Buffer hashed: %6d\n",nr_hashed_buffers
);
/* Walk each LRU list and tally the state of every buffer on it. */
1470 for(nlist
= 0; nlist
< NR_LIST
; nlist
++) {
/* Reset the per-list counters before scanning this list. */
1471 found
= locked
= dirty
= used
= lastused
= protected = 0;
1472 bh
= lru_list
[nlist
];
1477 if (buffer_locked(bh
))
1479 if (buffer_protected(bh
))
1481 if (buffer_dirty(bh
))
/* lastused remembers the scan position of the last in-use buffer. */
1484 used
++, lastused
= found
;
/* Advance along the circular free/LRU chain until we wrap to the head. */
1485 bh
= bh
->b_next_free
;
1486 } while (bh
!= lru_list
[nlist
]);
1487 printk("%8s: %d buffers, %d used (last=%d), "
1488 "%d locked, %d protected, %d dirty\n",
1489 buf_types
[nlist
], found
, used
, lastused
,
1490 locked
, protected, dirty
);
1495 /* ===================== Init ======================= */
1498 * allocate the hash table and init the free list
1499 * Use gfp() for the hash table to decrease TLB misses, use
1500 * SLAB cache for buffer heads.
/* NOTE(review): the enclosing comment delimiters and several body lines
 * (the "order" declaration, the do-loop opener, the NULL check before the
 * first panic, and the kmem_cache_create failure check) are missing from
 * this paste; comments below annotate only what is visible. */
1502 void __init
buffer_init(unsigned long memory_size
)
1505 unsigned int nr_hash
;
1507 /* we need to guess at the right sort of size for a buffer cache.
1508 the heuristic from working with large databases and getting
1509 fsync times (ext2) manageable, is the following */
/* Find the smallest power-of-two page order covering memory_size,
 * starting no lower than order 5. */
1512 for (order
= 5; (1UL << order
) < memory_size
; order
++);
1514 /* try to allocate something until we get it or we're asking
1515 for something that is really too small */
/* nr_hash = number of bucket pointers that fit in 2^order pages. */
1518 nr_hash
= (1UL << order
) * PAGE_SIZE
/
1519 sizeof(struct buffer_head
*);
1520 hash_table
= (struct buffer_head
**)
1521 __get_free_pages(GFP_ATOMIC
, order
);
/* Retry with progressively smaller orders, but never below order 5. */
1522 } while (hash_table
== NULL
&& --order
> 4);
1525 panic("Failed to allocate buffer hash table\n");
1526 memset(hash_table
, 0, nr_hash
* sizeof(struct buffer_head
*));
/* nr_hash is a power of two, so nr_hash-1 doubles as the hash bit mask. */
1527 bh_hash_mask
= nr_hash
-1;
/* SLAB cache for buffer heads; hardware-cacheline aligned objects. */
1529 bh_cachep
= kmem_cache_create("buffer_head",
1530 sizeof(struct buffer_head
),
1532 SLAB_HWCACHE_ALIGN
, NULL
, NULL
);
1534 panic("Cannot create buffer head SLAB cache\n");
1536 * Allocate the reserved buffer heads.
/* Pre-allocate NR_RESERVED buffer heads up front so I/O can still make
 * progress when the allocator is under memory pressure. */
1538 while (nr_buffer_heads
< NR_RESERVED
) {
1539 struct buffer_head
* bh
;
1541 bh
= kmem_cache_alloc(bh_cachep
, SLAB_ATOMIC
);
1544 put_unused_buffer_head(bh
);
1548 lru_list
[BUF_CLEAN
] = 0;
/* Seed the free list with one page's worth of BLOCK_SIZE buffers. */
1549 grow_buffers(BLOCK_SIZE
);
1553 /* ====================== bdflush support =================== */
1555 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1556 * response to dirty buffers. Once this process is activated, we write back
1557 * a limited number of buffers to the disks and then go back to sleep again.
/* Callers of wakeup_bdflush(1) sleep on this queue until bdflush signals
 * that a flush pass has completed (see wake_up(&bdflush_done) below). */
1559 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done
);
/* Task struct of the bdflush kernel thread, set once the daemon starts.
 * NOTE(review): initialised with 0 rather than NULL -- equivalent for a
 * pointer, but NULL would be the conventional null-pointer constant. */
1560 struct task_struct
*bdflush_tsk
= 0;
/* Kick the bdflush daemon awake.  When the (visible) wait path is taken,
 * also start the disk task queue and sleep until bdflush reports a
 * completed flush pass.  A no-op when called from bdflush itself, which
 * avoids self-deadlock.
 * NOTE(review): braces, the early return and the "if (wait)" guard are
 * missing from this paste, so the exact control flow is not visible here. */
1562 void wakeup_bdflush(int wait
)
1564 if (current
== bdflush_tsk
)
1566 wake_up_process(bdflush_tsk
);
/* Push any queued block I/O to the drivers before sleeping. */
1568 run_task_queue(&tq_disk
);
1569 sleep_on(&bdflush_done
);
1575 * Here we attempt to write back old buffers.
1576 * To prevent deadlocks for a loop device:
1577 * 1) Do non-blocking writes to loop (avoids deadlock with running
1578 * out of request blocks).
1579 * 2) But do a blocking write if the only dirty buffers are loop buffers
1580 * (otherwise we go into an infinite busy-loop).
1581 * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
1582 * with running out of free buffers for loop's "real" device).
/* NOTE(review): many original lines (braces, the refile calls, the ndirty
 * declaration, loop bodies and continues) are missing from this paste;
 * the comments below annotate only what is visible. */
1585 static inline void sync_old_buffers(void)
/* Loop-device writes start out as non-blocking write-ahead (WRITEA) and
 * may later be escalated to blocking WRITE -- see the note near 1661. */
1589 int wrta_cmd
= WRITEA
;
1591 int ncount
= 0, nwritten
= 0;
1593 struct buffer_head
* bh
, *next
;
/* Pass 1: sweep the CLEAN list and refile any buffer that became dirty
 * or locked while sitting there. */
1596 bh
= lru_list
[BUF_CLEAN
];
1598 for(i
= nr_buffers_type
[BUF_CLEAN
]; --i
> 0; bh
= next
) {
/* Grab the successor first: refiling unlinks bh from this list. */
1599 next
= bh
->b_next_free
;
1601 /* Dirty/locked buffer on clean list? Refile it */
1602 if (buffer_locked(bh
) || buffer_dirty(bh
)) {
/* Pass 2: sweep the LOCKED list and refile buffers whose I/O completed. */
1609 bh
= lru_list
[BUF_LOCKED
];
1611 for(i
= nr_buffers_type
[BUF_LOCKED
]; --i
> 0; bh
= next
) {
1612 next
= bh
->b_next_free
;
1614 /* Unlocked buffer on locked list? Refile it */
1615 if (!buffer_locked(bh
))
/* Pass 3: write back at most bdf_prm.b_un.ndirty buffers from the DIRTY
 * list, honouring the loop-device deadlock rules documented above. */
1620 bh
= lru_list
[BUF_DIRTY
];
1622 for (i
= nr_buffers_type
[BUF_DIRTY
];
1623 i
-- > 0 && ndirty
< bdf_prm
.b_un
.ndirty
;
1625 /* We may have stalled while waiting for
/* The list may have been refiled under us; restart if bh moved. */
1627 if(bh
->b_list
!= BUF_DIRTY
)
1629 next
= bh
->b_next_free
;
1630 if(!lru_list
[BUF_DIRTY
]) {
1631 printk("Dirty list empty %d\n", i
);
1635 /* Clean buffer on dirty list? Refile it */
1636 if (!buffer_dirty(bh
)) {
1641 if (buffer_locked(bh
))
1643 /* Should we write back buffers that are
1644 shared or not?? Currently dirty buffers
1645 are not shared, so it does not matter */
/* Clear the age stamp before queuing the write. */
1649 bh
->b_flushtime
= 0;
1650 if (MAJOR(bh
->b_dev
) == LOOP_MAJOR
) {
/* Non-blocking write-ahead for loop devices (rule 1 above). */
1651 ll_rw_block(wrta_cmd
,1, &bh
);
/* WRITEA may decline without blocking; the buffer then stays dirty. */
1653 if (buffer_dirty(bh
))
/* Ordinary devices get a plain (blocking) write. */
1657 ll_rw_block(WRITE
, 1, &bh
);
1661 /* If we didn't write anything, but there are still
1662 * dirty buffers, then make the next write to a
1663 * loop device to be a blocking write.
1664 * This lets us block--which we _must_ do! */
1666 && nr_buffers_type
[BUF_DIRTY
] > 0 && wrta_cmd
!= WRITE
) {
/* Diagnostics: report misfiled buffers and how much was written. */
1672 if (ncount
) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount
);
1673 printk("wrote %d/%d buffers...", nwritten
, ndirty
);
/* Start the disk task queue so the queued writes actually go out. */
1675 run_task_queue(&tq_disk
);
1679 /* This is the interface to bdflush. As we get more sophisticated, we can
1680 * pass tuning parameters to this "process", to adjust how it behaves.
1681 * We would want to verify each parameter, however, to make sure that it
/* NOTE(review): the kernel-lock bracketing, the decoding of func into the
 * parameter index i, and several return statements are missing from this
 * paste; comments below annotate only what is visible. */
1684 asmlinkage
int sys_bdflush(int func
, long data
)
1686 int i
, error
= -EPERM
;
/* Tuning the buffer cache is a privileged operation. */
1689 if (!capable(CAP_SYS_ADMIN
))
1693 /* Func 1 used to call sync_old_buffers; a user space
1694 daemon would call it periodically. This is no
1695 longer necessary. Returning -EPERM here makes the
1696 daemon silently exit. */
1699 /* Basically func 1 means read param 1, 2 means write param 1, etc */
/* Reject parameter indices outside the bdf_prm table. */
1703 if (i
< 0 || i
>= N_PARAM
)
/* Even func values read parameter i back to user space... */
1705 if((func
& 1) == 0) {
1706 error
= put_user(bdf_prm
.data
[i
], (int*)data
);
/* ...odd func values store it, after range-checking against the
 * per-parameter bdflush_min/bdflush_max tables. */
1709 if (data
< bdflush_min
[i
] || data
> bdflush_max
[i
])
1711 bdf_prm
.data
[i
] = data
;
1716 /* Having func 0 used to launch the actual bdflush and then never
1717 * return (unless explicitly killed). We return zero here to
1718 * remain semi-compatible with present update(8) programs.
1726 /* This is the actual bdflush daemon itself. It used to be started
1727 * from the syscall above, but now we launch it ourselves internally
1728 * with kernel_thread(...) directly after the first thread in
1729 * init/main.c. Every so often, or when woken up by another task that
1730 * needs memory, we call sync_old_buffers to partially clear the dirty list.
1733 int bdflush(void * unused
)
1735 long remaining
= HZ
* bdf_prm
.b_un
.interval
;
1736 struct task_struct
*tsk
= current
;
1739 * We have a bare-bones task_struct, and really should fill
1740 * in a few more things so "top" and /proc/2/{exe,root,cwd}
1741 * display semi-sane things. Not real crucial though...
1746 tsk
->dumpable
= 0; /* inhibit ptrace() */
1747 strcpy(tsk
->comm
, "kflushd");
1748 sigfillset(&tsk
->blocked
);
1752 * As a kernel thread we want to tamper with system buffers
1753 * and other internals and thus be subject to the SMP locking
1754 * rules. (On a uniprocessor box this does nothing).
1759 tsk
->state
= TASK_INTERRUPTIBLE
;
1760 remaining
= schedule_timeout(remaining
);
1763 printk("bdflush() activated...");
1765 CHECK_EMERGENCY_SYNC
1767 if (remaining
== 0) {
1769 * Also try to flush inodes and supers, since
1770 * otherwise there would be no way of ensuring
1771 * that these quantities ever get written
1772 * back. Ideally, we would have a timestamp
1773 * on the inodes and superblocks so that we
1774 * could write back only the old ones.
1778 remaining
= HZ
* bdf_prm
.b_un
.interval
;
1781 /* Keep flushing till there aren't very many dirty buffers */
1784 } while(nr_buffers_type
[BUF_DIRTY
] > nr_buffers
* bdf_prm
.b_un
.nfract
/100);
1786 wake_up(&bdflush_done
);
1788 printk("sleeping again.\n");