fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9  * been avoided by NEVER letting an interrupt change a buffer (except for the
  10  * data, of course), but instead letting the caller do it.
  11  */
  12
  13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15 /* Removed a lot of unnecessary code and simplified things now that
  16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17  */
  18
  19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20  * hash table, use SLAB cache for buffer heads. -DaveM
  21  */
  22
  23 /* Added 32k buffer block sizes - these are required older ARM systems.
  24  * - RMK
  25  */
  26
  27 #include <linux/malloc.h>
  28 #include <linux/locks.h>
  29 #include <linux/errno.h>
  30 #include <linux/swap.h>
  31 #include <linux/swapctl.h>
  32 #include <linux/smp_lock.h>
  33 #include <linux/vmalloc.h>
  34 #include <linux/blkdev.h>
  35 #include <linux/sysrq.h>
  36 #include <linux/file.h>
  37 #include <linux/init.h>
  38 #include <linux/quotaops.h>
  39
  40 #include <asm/uaccess.h>
  41 #include <asm/io.h>
  42 #include <asm/bitops.h>
  43
  44 #define NR_SIZES 7
  45 static char buffersize_index[65] =
  46 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  47   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  48   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  49  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  50   6};
  51
  52 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
  53 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
  54 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
  55 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
  56                                              number of unused buffer heads */
  57
  58 /*
  59  * Hash table mask..
  60  */
  61 static unsigned long bh_hash_mask = 0;
  62
  63 static int grow_buffers(int size);
  64
  65 static struct buffer_head ** hash_table;
  66 static struct buffer_head * lru_list[NR_LIST] = {NULL, };
  67 static struct buffer_head * free_list[NR_SIZES] = {NULL, };
  68
  69 static kmem_cache_t *bh_cachep;
  70
  71 static struct buffer_head * unused_list = NULL;
  72 static struct buffer_head * reuse_list = NULL;
  73 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  74
  75 static int nr_buffers = 0;
  76 static int nr_buffers_type[NR_LIST] = {0,};
  77 static int nr_buffer_heads = 0;
  78 static int nr_unused_buffer_heads = 0;
  79 static int nr_hashed_buffers = 0;
  80
  81 /* This is used by some architectures to estimate available memory. */
  82 int buffermem = 0;
  83
  84 /* Here is the parameter block for the bdflush process. If you add or
  85  * remove any of the parameters, make sure to update kernel/sysctl.c.
  86  */
  87
  88 #define N_PARAM 9
  89
  90 /* The dummy values in this structure are left in there for compatibility
  91  * with old programs that play with the /proc entries.
  92  */
  93 union bdflush_param{
  94         struct {
  95                 int nfract;  /* Percentage of buffer cache dirty to
  96                                 activate bdflush */
  97                 int ndirty;  /* Maximum number of dirty blocks to write out per
  98                                 wake-cycle */
  99                 int nrefill; /* Number of clean buffers to try to obtain
 100                                 each time we call refill */
 101                 int nref_dirt; /* Dirty buffer threshold for activating bdflush
 102                                   when trying to refill buffers. */
 103                 int dummy1;    /* unused */
 104                 int age_buffer;  /* Time for normal buffer to age before
 105                                     we flush it */
 106                 int age_super;  /* Time for superblock to age before we
 107                                    flush it */
 108                 int dummy2;    /* unused */
 109                 int dummy3;    /* unused */
 110         } b_un;
 111         unsigned int data[N_PARAM];
 112 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
 113
 114 /* These are the min and max parameter values that we will allow to be assigned */
 115 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
 116 int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 600*HZ, 600*HZ, 2047, 5};
 117
 118 void wakeup_bdflush(int);
 119
 120 /*
 121  * Rewrote the wait-routines to use the "new" wait-queue functionality,
 122  * and getting rid of the cli-sti pairs. The wait-queue routines still
 123  * need cli-sti, but now it's just a couple of 386 instructions or so.
 124  *
 125  * Note that the real wait_on_buffer() is an inline function that checks
 126  * if 'b_wait' is set before calling this, so that the queues aren't set
 127  * up unnecessarily.
 128  */
 129 void __wait_on_buffer(struct buffer_head * bh)
 130 {
 131         struct task_struct *tsk = current;
 132         DECLARE_WAITQUEUE(wait, tsk);
 133
 134         bh->b_count++;
 135         add_wait_queue(&bh->b_wait, &wait);
 136 repeat:
 137         tsk->state = TASK_UNINTERRUPTIBLE;
 138         run_task_queue(&tq_disk);
 139         if (buffer_locked(bh)) {
 140                 schedule();
 141                 goto repeat;
 142         }
 143         tsk->state = TASK_RUNNING;
 144         remove_wait_queue(&bh->b_wait, &wait);
 145         bh->b_count--;
 146 }
 147
 148 /* Call sync_buffers with wait!=0 to ensure that the call does not
 149  * return until all buffer writes have completed.  Sync() may return
 150  * before the writes have finished; fsync() may not.
 151  */
 152
 153 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
 154  * spontaneously dirty themselves without ever brelse being called.
 155  * We will ultimately want to put these in a separate list, but for
 156  * now we search all of the lists for dirty buffers.
 157  */
 158 static int sync_buffers(kdev_t dev, int wait)
 159 {
 160         int i, retry, pass = 0, err = 0;
 161         struct buffer_head * bh, *next;
 162
 163         /* One pass for no-wait, three for wait:
 164          * 0) write out all dirty, unlocked buffers;
 165          * 1) write out all dirty buffers, waiting if locked;
 166          * 2) wait for completion by waiting for all buffers to unlock.
 167          */
 168         do {
 169                 retry = 0;
 170 repeat:
 171                 /* We search all lists as a failsafe mechanism, not because we expect
 172                  * there to be dirty buffers on any of the other lists.
 173                  */
 174                 bh = lru_list[BUF_DIRTY];
 175                 if (!bh)
 176                         goto repeat2;
 177                 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
 178                         if (bh->b_list != BUF_DIRTY)
 179                                 goto repeat;
 180                         next = bh->b_next_free;
 181                         if (!lru_list[BUF_DIRTY])
 182                                 break;
 183                         if (dev && bh->b_dev != dev)
 184                                 continue;
 185                         if (buffer_locked(bh)) {
 186                                 /* Buffer is locked; skip it unless wait is
 187                                  * requested AND pass > 0.
 188                                  */
 189                                 if (!wait || !pass) {
 190                                         retry = 1;
 191                                         continue;
 192                                 }
 193                                 wait_on_buffer (bh);
 194                                 goto repeat;
 195                         }
 196
 197                         /* If an unlocked buffer is not uptodate, there has
 198                          * been an IO error. Skip it.
 199                          */
 200                         if (wait && buffer_req(bh) && !buffer_locked(bh) &&
 201                             !buffer_dirty(bh) && !buffer_uptodate(bh)) {
 202                                 err = -EIO;
 203                                 continue;
 204                         }
 205
 206                         /* Don't write clean buffers.  Don't write ANY buffers
 207                          * on the third pass.
 208                          */
 209                         if (!buffer_dirty(bh) || pass >= 2)
 210                                 continue;
 211
 212                         /* Don't bother about locked buffers.
 213                          *
 214                          * XXX We checked if it was locked above and there is no
 215                          * XXX way we could have slept in between. -DaveM
 216                          */
 217                         if (buffer_locked(bh))
 218                                 continue;
 219                         bh->b_count++;
 220                         next->b_count++;
 221                         bh->b_flushtime = 0;
 222                         ll_rw_block(WRITE, 1, &bh);
 223                         bh->b_count--;
 224                         next->b_count--;
 225                         retry = 1;
 226                 }
 227
 228     repeat2:
 229                 bh = lru_list[BUF_LOCKED];
 230                 if (!bh)
 231                         break;
 232                 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
 233                         if (bh->b_list != BUF_LOCKED)
 234                                 goto repeat2;
 235                         next = bh->b_next_free;
 236                         if (!lru_list[BUF_LOCKED])
 237                                 break;
 238                         if (dev && bh->b_dev != dev)
 239                                 continue;
 240                         if (buffer_locked(bh)) {
 241                                 /* Buffer is locked; skip it unless wait is
 242                                  * requested AND pass > 0.
 243                                  */
 244                                 if (!wait || !pass) {
 245                                         retry = 1;
 246                                         continue;
 247                                 }
 248                                 wait_on_buffer (bh);
 249                                 goto repeat2;
 250                         }
 251                 }
 252
 253                 /* If we are waiting for the sync to succeed, and if any dirty
 254                  * blocks were written, then repeat; on the second pass, only
 255                  * wait for buffers being written (do not pass to write any
 256                  * more buffers on the second pass).
 257                  */
 258         } while (wait && retry && ++pass<=2);
 259         return err;
 260 }
 261
 262 void sync_dev(kdev_t dev)
 263 {
 264         sync_buffers(dev, 0);
 265         sync_supers(dev);
 266         sync_inodes(dev);
 267         sync_buffers(dev, 0);
 268         DQUOT_SYNC(dev);
 269         /*
 270          * FIXME(eric) we need to sync the physical devices here.
 271          * This is because some (scsi) controllers have huge amounts of
 272          * cache onboard (hundreds of Mb), and we need to instruct
 273          * them to commit all of the dirty memory to disk, and we should
 274          * not return until this has happened.
 275          *
 276          * This would need to get implemented by going through the assorted
 277          * layers so that each block major number can be synced, and this
 278          * would call down into the upper and mid-layer scsi.
 279          */
 280 }
 281
 282 int fsync_dev(kdev_t dev)
 283 {
 284         sync_buffers(dev, 0);
 285         sync_supers(dev);
 286         sync_inodes(dev);
 287         DQUOT_SYNC(dev);
 288         return sync_buffers(dev, 1);
 289 }
 290
 291 asmlinkage int sys_sync(void)
 292 {
 293         lock_kernel();
 294         fsync_dev(0);
 295         unlock_kernel();
 296         return 0;
 297 }
 298
 299 /*
 300  *      filp may be NULL if called via the msync of a vma.
 301  */
 302
 303 int file_fsync(struct file *filp, struct dentry *dentry)
 304 {
 305         struct inode * inode = dentry->d_inode;
 306         struct super_block * sb;
 307         kdev_t dev;
 308
 309         /* sync the inode to buffers */
 310         write_inode_now(inode);
 311
 312         /* sync the superblock to buffers */
 313         sb = inode->i_sb;
 314         wait_on_super(sb);
 315         if (sb->s_op && sb->s_op->write_super)
 316                 sb->s_op->write_super(sb);
 317
 318         /* .. finally sync the buffers to disk */
 319         dev = inode->i_dev;
 320         return sync_buffers(dev, 1);
 321 }
 322
 323 asmlinkage int sys_fsync(unsigned int fd)
 324 {
 325         struct file * file;
 326         struct dentry * dentry;
 327         struct inode * inode;
 328         int err;
 329
 330         lock_kernel();
 331         err = -EBADF;
 332         file = fget(fd);
 333         if (!file)
 334                 goto out;
 335
 336         dentry = file->f_dentry;
 337         if (!dentry)
 338                 goto out_putf;
 339
 340         inode = dentry->d_inode;
 341         if (!inode)
 342                 goto out_putf;
 343
 344         err = -EINVAL;
 345         if (!file->f_op || !file->f_op->fsync)
 346                 goto out_putf;
 347
 348         /* We need to protect against concurrent writers.. */
 349         down(&inode->i_sem);
 350         err = file->f_op->fsync(file, dentry);
 351         up(&inode->i_sem);
 352
 353 out_putf:
 354         fput(file);
 355 out:
 356         unlock_kernel();
 357         return err;
 358 }
 359
 360 asmlinkage int sys_fdatasync(unsigned int fd)
 361 {
 362         struct file * file;
 363         struct dentry * dentry;
 364         struct inode * inode;
 365         int err;
 366
 367         lock_kernel();
 368         err = -EBADF;
 369         file = fget(fd);
 370         if (!file)
 371                 goto out;
 372
 373         dentry = file->f_dentry;
 374         if (!dentry)
 375                 goto out_putf;
 376
 377         inode = dentry->d_inode;
 378         if (!inode)
 379                 goto out_putf;
 380
 381         err = -EINVAL;
 382         if (!file->f_op || !file->f_op->fsync)
 383                 goto out_putf;
 384
 385         /* this needs further work, at the moment it is identical to fsync() */
 386         down(&inode->i_sem);
 387         err = file->f_op->fsync(file, dentry);
 388         up(&inode->i_sem);
 389
 390 out_putf:
 391         fput(file);
 392 out:
 393         unlock_kernel();
 394         return err;
 395 }
 396
 397 void invalidate_buffers(kdev_t dev)
 398 {
 399         int i;
 400         int nlist;
 401         struct buffer_head * bh;
 402
 403         for(nlist = 0; nlist < NR_LIST; nlist++) {
 404                 bh = lru_list[nlist];
 405                 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
 406                         if (bh->b_dev != dev)
 407                                 continue;
 408                         wait_on_buffer(bh);
 409                         if (bh->b_dev != dev)
 410                                 continue;
 411                         if (bh->b_count)
 412                                 continue;
 413                         bh->b_flushtime = 0;
 414                         clear_bit(BH_Protected, &bh->b_state);
 415                         clear_bit(BH_Uptodate, &bh->b_state);
 416                         clear_bit(BH_Dirty, &bh->b_state);
 417                         clear_bit(BH_Req, &bh->b_state);
 418                 }
 419         }
 420 }
 421
 422 #define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
 423 #define hash(dev,block) hash_table[_hashfn(dev,block)]
 424
 425 static inline void remove_from_hash_queue(struct buffer_head * bh)
 426 {
 427         struct buffer_head **pprev = bh->b_pprev;
 428         if (pprev) {
 429                 struct buffer_head * next = bh->b_next;
 430                 if (next) {
 431                         next->b_pprev = pprev;
 432                         bh->b_next = NULL;
 433                 }
 434                 *pprev = next;
 435                 bh->b_pprev = NULL;
 436         }
 437         nr_hashed_buffers--;
 438 }
 439
 440 static inline void remove_from_lru_list(struct buffer_head * bh)
 441 {
 442         if (!(bh->b_prev_free) || !(bh->b_next_free))
 443                 panic("VFS: LRU block list corrupted");
 444         if (bh->b_dev == B_FREE)
 445                 panic("LRU list corrupted");
 446         bh->b_prev_free->b_next_free = bh->b_next_free;
 447         bh->b_next_free->b_prev_free = bh->b_prev_free;
 448
 449         if (lru_list[bh->b_list] == bh)
 450                  lru_list[bh->b_list] = bh->b_next_free;
 451         if (lru_list[bh->b_list] == bh)
 452                  lru_list[bh->b_list] = NULL;
 453         bh->b_next_free = bh->b_prev_free = NULL;
 454 }
 455
 456 static inline void remove_from_free_list(struct buffer_head * bh)
 457 {
 458         int isize = BUFSIZE_INDEX(bh->b_size);
 459         if (!(bh->b_prev_free) || !(bh->b_next_free))
 460                 panic("VFS: Free block list corrupted");
 461         if(bh->b_dev != B_FREE)
 462                 panic("Free list corrupted");
 463         if(!free_list[isize])
 464                 panic("Free list empty");
 465         if(bh->b_next_free == bh)
 466                  free_list[isize] = NULL;
 467         else {
 468                 bh->b_prev_free->b_next_free = bh->b_next_free;
 469                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 470                 if (free_list[isize] == bh)
 471                          free_list[isize] = bh->b_next_free;
 472         }
 473         bh->b_next_free = bh->b_prev_free = NULL;
 474 }
 475
 476 static void remove_from_queues(struct buffer_head * bh)
 477 {
 478         if(bh->b_dev == B_FREE) {
 479                 remove_from_free_list(bh); /* Free list entries should not be
 480                                               in the hash queue */
 481                 return;
 482         }
 483         nr_buffers_type[bh->b_list]--;
 484         remove_from_hash_queue(bh);
 485         remove_from_lru_list(bh);
 486 }
 487
 488 static inline void put_last_free(struct buffer_head * bh)
 489 {
 490         if (bh) {
 491                 struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
 492
 493                 bh->b_dev = B_FREE;  /* So it is obvious we are on the free list. */
 494
 495                 /* Add to back of free list. */
 496                 if(!*bhp) {
 497                         *bhp = bh;
 498                         bh->b_prev_free = bh;
 499                 }
 500
 501                 bh->b_next_free = *bhp;
 502                 bh->b_prev_free = (*bhp)->b_prev_free;
 503                 (*bhp)->b_prev_free->b_next_free = bh;
 504                 (*bhp)->b_prev_free = bh;
 505         }
 506 }
 507
 508 static void insert_into_queues(struct buffer_head * bh)
 509 {
 510         /* put at end of free list */
 511         if(bh->b_dev == B_FREE) {
 512                 put_last_free(bh);
 513         } else {
 514                 struct buffer_head **bhp = &lru_list[bh->b_list];
 515
 516                 if(!*bhp) {
 517                         *bhp = bh;
 518                         bh->b_prev_free = bh;
 519                 }
 520
 521                 if (bh->b_next_free)
 522                         panic("VFS: buffer LRU pointers corrupted");
 523
 524                 bh->b_next_free = *bhp;
 525                 bh->b_prev_free = (*bhp)->b_prev_free;
 526                 (*bhp)->b_prev_free->b_next_free = bh;
 527                 (*bhp)->b_prev_free = bh;
 528
 529                 nr_buffers_type[bh->b_list]++;
 530
 531                 /* Put the buffer in new hash-queue if it has a device. */
 532                 bh->b_next = NULL;
 533                 bh->b_pprev = NULL;
 534                 if (bh->b_dev) {
 535                         struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
 536                         struct buffer_head *next = *bhp;
 537
 538                         if (next) {
 539                                 bh->b_next = next;
 540                                 next->b_pprev = &bh->b_next;
 541                         }
 542                         *bhp = bh;
 543                         bh->b_pprev = bhp;
 544                 }
 545                 nr_hashed_buffers++;
 546         }
 547 }
 548
 549 struct buffer_head * find_buffer(kdev_t dev, int block, int size)
 550 {
 551         struct buffer_head * next;
 552
 553         next = hash(dev,block);
 554         for (;;) {
 555                 struct buffer_head *tmp = next;
 556                 if (!next)
 557                         break;
 558                 next = tmp->b_next;
 559                 if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
 560                         continue;
 561                 next = tmp;
 562                 break;
 563         }
 564         return next;
 565 }
 566
 567 /*
 568  * Why like this, I hear you say... The reason is race-conditions.
 569  * As we don't lock buffers (unless we are reading them, that is),
 570  * something might happen to it while we sleep (ie a read-error
 571  * will force it bad). This shouldn't really happen currently, but
 572  * the code is ready.
 573  */
 574 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 575 {
 576         struct buffer_head * bh;
 577         bh = find_buffer(dev,block,size);
 578         if (bh)
 579                 bh->b_count++;
 580         return bh;
 581 }
 582
 583 unsigned int get_hardblocksize(kdev_t dev)
 584 {
 585         /*
 586          * Get the hard sector size for the given device.  If we don't know
 587          * what it is, return 0.
 588          */
 589         if (hardsect_size[MAJOR(dev)] != NULL) {
 590                 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
 591                 if (blksize != 0)
 592                         return blksize;
 593         }
 594
 595         /*
 596          * We don't know what the hardware sector size for this device is.
 597          * Return 0 indicating that we don't know.
 598          */
 599         return 0;
 600 }
 601
 602 void set_blocksize(kdev_t dev, int size)
 603 {
 604         extern int *blksize_size[];
 605         int i, nlist;
 606         struct buffer_head * bh, *bhnext;
 607
 608         if (!blksize_size[MAJOR(dev)])
 609                 return;
 610
 611         /* Size must be a power of two, and between 512 and PAGE_SIZE */
 612         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
 613                 panic("Invalid blocksize passed to set_blocksize");
 614
 615         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
 616                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 617                 return;
 618         }
 619         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
 620                 return;
 621         sync_buffers(dev, 2);
 622         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 623
 624         /* We need to be quite careful how we do this - we are moving entries
 625          * around on the free list, and we can get in a loop if we are not careful.
 626          */
 627         for(nlist = 0; nlist < NR_LIST; nlist++) {
 628                 bh = lru_list[nlist];
 629                 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
 630                         if(!bh)
 631                                 break;
 632
 633                         bhnext = bh->b_next_free;
 634                         if (bh->b_dev != dev)
 635                                  continue;
 636                         if (bh->b_size == size)
 637                                  continue;
 638                         bhnext->b_count++;
 639                         wait_on_buffer(bh);
 640                         bhnext->b_count--;
 641                         if (bh->b_dev == dev && bh->b_size != size) {
 642                                 clear_bit(BH_Dirty, &bh->b_state);
 643                                 clear_bit(BH_Uptodate, &bh->b_state);
 644                                 clear_bit(BH_Req, &bh->b_state);
 645                                 bh->b_flushtime = 0;
 646                         }
 647                         remove_from_queues(bh);
 648                         bh->b_dev=B_FREE;
 649                         insert_into_queues(bh);
 650                 }
 651         }
 652 }
 653
 654 /*
 655  * We used to try various strange things. Let's not.
 656  */
 657 static void refill_freelist(int size)
 658 {
 659         if (!grow_buffers(size)) {
 660                 wakeup_bdflush(1);
 661                 current->policy |= SCHED_YIELD;
 662                 schedule();
 663         }
 664 }
 665
 666 void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
 667                  bh_end_io_t *handler, void *dev_id)
 668 {
 669         bh->b_count = 1;
 670         bh->b_list = BUF_CLEAN;
 671         bh->b_flushtime = 0;
 672         bh->b_dev = dev;
 673         bh->b_blocknr = block;
 674         bh->b_end_io = handler;
 675         bh->b_dev_id = dev_id;
 676 }
 677
 678 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 679 {
 680         mark_buffer_uptodate(bh, uptodate);
 681         unlock_buffer(bh);
 682 }
 683
 684 /*
 685  * Ok, this is getblk, and it isn't very clear, again to hinder
 686  * race-conditions. Most of the code is seldom used, (ie repeating),
 687  * so it should be much more efficient than it looks.
 688  *
 689  * The algorithm is changed: hopefully better, and an elusive bug removed.
 690  *
 691  * 14.02.92: changed it to sync dirty buffers a bit: better performance
 692  * when the filesystem starts to get full of dirty blocks (I hope).
 693  */
 694 struct buffer_head * getblk(kdev_t dev, int block, int size)
 695 {
 696         struct buffer_head * bh;
 697         int isize;
 698
 699 repeat:
 700         bh = get_hash_table(dev, block, size);
 701         if (bh) {
 702                 if (!buffer_dirty(bh)) {
 703                         bh->b_flushtime = 0;
 704                 }
 705                 return bh;
 706         }
 707
 708         isize = BUFSIZE_INDEX(size);
 709 get_free:
 710         bh = free_list[isize];
 711         if (!bh)
 712                 goto refill;
 713         remove_from_free_list(bh);
 714
 715         /* OK, FINALLY we know that this buffer is the only one of its kind,
 716          * and that it's unused (b_count=0), unlocked, and clean.
 717          */
 718         init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
 719         bh->b_state=0;
 720         insert_into_queues(bh);
 721         return bh;
 722
 723         /*
 724          * If we block while refilling the free list, somebody may
 725          * create the buffer first ... search the hashes again.
 726          */
 727 refill:
 728         refill_freelist(size);
 729         if (!find_buffer(dev,block,size))
 730                 goto get_free;
 731         goto repeat;
 732 }
 733
 734 void set_writetime(struct buffer_head * buf, int flag)
 735 {
 736         int newtime;
 737
 738         if (buffer_dirty(buf)) {
 739                 /* Move buffer to dirty list if jiffies is clear. */
 740                 newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
 741                                      bdf_prm.b_un.age_buffer);
 742                 if(!buf->b_flushtime || buf->b_flushtime > newtime)
 743                          buf->b_flushtime = newtime;
 744         } else {
 745                 buf->b_flushtime = 0;
 746         }
 747 }
 748
 749
 750 /*
 751  * Put a buffer into the appropriate list, without side-effects.
 752  */
 753 static inline void file_buffer(struct buffer_head *bh, int list)
 754 {
 755         remove_from_queues(bh);
 756         bh->b_list = list;
 757         insert_into_queues(bh);
 758 }
 759
 760 /*
 761  * A buffer may need to be moved from one buffer list to another
 762  * (e.g. in case it is not shared any more). Handle this.
 763  */
 764 void refile_buffer(struct buffer_head * buf)
 765 {
 766         int dispose;
 767
 768         if(buf->b_dev == B_FREE) {
 769                 printk("Attempt to refile free buffer\n");
 770                 return;
 771         }
 772         if (buffer_dirty(buf))
 773                 dispose = BUF_DIRTY;
 774         else if (buffer_locked(buf))
 775                 dispose = BUF_LOCKED;
 776         else
 777                 dispose = BUF_CLEAN;
 778         if(dispose != buf->b_list) {
 779                 file_buffer(buf, dispose);
 780                 if(dispose == BUF_DIRTY) {
 781                         int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
 782
 783                         /* This buffer is dirty, maybe we need to start flushing.
 784                          * If too high a percentage of the buffers are dirty...
 785                          */
 786                         if (nr_buffers_type[BUF_DIRTY] > too_many)
 787                                 wakeup_bdflush(1);
 788
 789                         /* If this is a loop device, and
 790                          * more than half of the buffers are dirty...
 791                          * (Prevents no-free-buffers deadlock with loop device.)
 792                          */
 793                         if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
 794                             nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
 795                                 wakeup_bdflush(1);
 796                 }
 797         }
 798 }
 799
 800 /*
 801  * Release a buffer head
 802  */
 803 void __brelse(struct buffer_head * buf)
 804 {
 805         /* If dirty, mark the time this buffer should be written back. */
 806         set_writetime(buf, 0);
 807         refile_buffer(buf);
 808         touch_buffer(buf);
 809
 810         if (buf->b_count) {
 811                 buf->b_count--;
 812                 return;
 813         }
 814         printk("VFS: brelse: Trying to free free buffer\n");
 815 }
 816
 817 /*
 818  * bforget() is like brelse(), except it puts the buffer on the
 819  * free list if it can.. We can NOT free the buffer if:
 820  *  - there are other users of it
 821  *  - it is locked and thus can have active IO
 822  */
 823 void __bforget(struct buffer_head * buf)
 824 {
 825         if (buf->b_count != 1 || buffer_locked(buf)) {
 826                 __brelse(buf);
 827                 return;
 828         }
 829         buf->b_count = 0;
 830         buf->b_state = 0;
 831         remove_from_queues(buf);
 832         put_last_free(buf);
 833 }
 834
 835 /*
 836  * bread() reads a specified block and returns the buffer that contains
 837  * it. It returns NULL if the block was unreadable.
 838  */
 839 struct buffer_head * bread(kdev_t dev, int block, int size)
 840 {
 841         struct buffer_head * bh;
 842
 843         bh = getblk(dev, block, size);
 844         if (buffer_uptodate(bh))
 845                 return bh;
 846         ll_rw_block(READ, 1, &bh);
 847         wait_on_buffer(bh);
 848         if (buffer_uptodate(bh))
 849                 return bh;
 850         brelse(bh);
 851         return NULL;
 852 }
 853
 854 /*
 855  * Ok, breada can be used as bread, but additionally to mark other
 856  * blocks for reading as well. End the argument list with a negative
 857  * number.
 858  */
 859
 860 #define NBUF 16
 861
 862 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
 863         unsigned int pos, unsigned int filesize)
 864 {
 865         struct buffer_head * bhlist[NBUF];
 866         unsigned int blocks;
 867         struct buffer_head * bh;
 868         int index;
 869         int i, j;
 870
 871         if (pos >= filesize)
 872                 return NULL;
 873
 874         if (block < 0)
 875                 return NULL;
 876
 877         bh = getblk(dev, block, bufsize);
 878         index = BUFSIZE_INDEX(bh->b_size);
 879
 880         if (buffer_uptodate(bh))
 881                 return(bh);
 882         else ll_rw_block(READ, 1, &bh);
 883
 884         blocks = (filesize - pos) >> (9+index);
 885
 886         if (blocks < (read_ahead[MAJOR(dev)] >> index))
 887                 blocks = read_ahead[MAJOR(dev)] >> index;
 888         if (blocks > NBUF)
 889                 blocks = NBUF;
 890
 891 /*      if (blocks) printk("breada (new) %d blocks\n",blocks); */
 892
 893
 894         bhlist[0] = bh;
 895         j = 1;
 896         for(i=1; i<blocks; i++) {
 897                 bh = getblk(dev,block+i,bufsize);
 898                 if (buffer_uptodate(bh)) {
 899                         brelse(bh);
 900                         break;
 901                 }
 902                 else bhlist[j++] = bh;
 903         }
 904
 905         /* Request the read for these buffers, and then release them. */
 906         if (j>1)
 907                 ll_rw_block(READA, (j-1), bhlist+1);
 908         for(i=1; i<j; i++)
 909                 brelse(bhlist[i]);
 910
 911         /* Wait for this buffer, and then continue on. */
 912         bh = bhlist[0];
 913         wait_on_buffer(bh);
 914         if (buffer_uptodate(bh))
 915                 return bh;
 916         brelse(bh);
 917         return NULL;
 918 }
 919
 920 /*
 921  * Note: the caller should wake up the buffer_wait list if needed.
 922  */
 923 static void put_unused_buffer_head(struct buffer_head * bh)
 924 {
 925         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
 926                 nr_buffer_heads--;
 927                 kmem_cache_free(bh_cachep, bh);
 928                 return;
 929         }
 930
 931         memset(bh,0,sizeof(*bh));
 932         init_waitqueue_head(&bh->b_wait);
 933         nr_unused_buffer_heads++;
 934         bh->b_next_free = unused_list;
 935         unused_list = bh;
 936 }
 937
 938 /*
 939  * We can't put completed temporary IO buffer_heads directly onto the
 940  * unused_list when they become unlocked, since the device driver
 941  * end_request routines still expect access to the buffer_head's
 942  * fields after the final unlock.  So, the device driver puts them on
 943  * the reuse_list instead once IO completes, and we recover these to
 944  * the unused_list here.
 945  *
 946  * Note that we don't do a wakeup here, but return a flag indicating
 947  * whether we got any buffer heads. A task ready to sleep can check
 948  * the returned value, and any tasks already sleeping will have been
 949  * awakened when the buffer heads were added to the reuse list.
 950  */
 951 static inline int recover_reusable_buffer_heads(void)
 952 {
 953         struct buffer_head *head = xchg(&reuse_list, NULL);
 954         int found = 0;
 955
 956         if (head) {
 957                 do {
 958                         struct buffer_head *bh = head;
 959                         head = head->b_next_free;
 960                         put_unused_buffer_head(bh);
 961                 } while (head);
 962                 found = 1;
 963         }
 964         return found;
 965 }
 966
 967 /*
 968  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
 969  * no-buffer-head deadlock.  Return NULL on failure; waiting for
 970  * buffer heads is now handled in create_buffers().
 971  */
 972 static struct buffer_head * get_unused_buffer_head(int async)
 973 {
 974         struct buffer_head * bh;
 975
 976         recover_reusable_buffer_heads();
 977         if (nr_unused_buffer_heads > NR_RESERVED) {
 978                 bh = unused_list;
 979                 unused_list = bh->b_next_free;
 980                 nr_unused_buffer_heads--;
 981                 return bh;
 982         }
 983
 984         /* This is critical.  We can't swap out pages to get
 985          * more buffer heads, because the swap-out may need
 986          * more buffer-heads itself.  Thus SLAB_BUFFER.
 987          */
 988         if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
 989                 memset(bh, 0, sizeof(*bh));
 990                 init_waitqueue_head(&bh->b_wait);
 991                 nr_buffer_heads++;
 992                 return bh;
 993         }
 994
 995         /*
 996          * If we need an async buffer, use the reserved buffer heads.
 997          */
 998         if (async && unused_list) {
 999                 bh = unused_list;
1000                 unused_list = bh->b_next_free;
1001                 nr_unused_buffer_heads--;
1002                 return bh;
1003         }
1004
1005 #if 0
1006         /*
1007          * (Pending further analysis ...)
1008          * Ordinary (non-async) requests can use a different memory priority
1009          * to free up pages. Any swapping thus generated will use async
1010          * buffer heads.
1011          */
1012         if(!async &&
1013            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1014                 memset(bh, 0, sizeof(*bh));
1015                 init_waitqueue_head(&bh->b_wait);
1016                 nr_buffer_heads++;
1017                 return bh;
1018         }
1019 #endif
1020
1021         return NULL;
1022 }
1023
1024 /*
1025  * Create the appropriate buffers when given a page for data area and
1026  * the size of each buffer.. Use the bh->b_this_page linked list to
1027  * follow the buffers created.  Return NULL if unable to create more
1028  * buffers.
1029  * The async flag is used to differentiate async IO (paging, swapping)
1030  * from ordinary buffer allocations, and only async requests are allowed
1031  * to sleep waiting for buffer heads.
1032  */
1033 static struct buffer_head * create_buffers(unsigned long page,
1034                                                 unsigned long size, int async)
1035 {
1036         DECLARE_WAITQUEUE(wait, current);
1037         struct buffer_head *bh, *head;
1038         long offset;
1039
1040 try_again:
1041         head = NULL;
1042         offset = PAGE_SIZE;
1043         while ((offset -= size) >= 0) {
1044                 bh = get_unused_buffer_head(async);
1045                 if (!bh)
1046                         goto no_grow;
1047
1048                 bh->b_dev = B_FREE;  /* Flag as unused */
1049                 bh->b_this_page = head;
1050                 head = bh;
1051
1052                 bh->b_state = 0;
1053                 bh->b_next_free = NULL;
1054                 bh->b_count = 0;
1055                 bh->b_size = size;
1056
1057                 bh->b_data = (char *) (page+offset);
1058                 bh->b_list = 0;
1059         }
1060         return head;
1061 /*
1062  * In case anything failed, we just free everything we got.
1063  */
1064 no_grow:
1065         if (head) {
1066                 do {
1067                         bh = head;
1068                         head = head->b_this_page;
1069                         put_unused_buffer_head(bh);
1070                 } while (head);
1071
1072                 /* Wake up any waiters ... */
1073                 wake_up(&buffer_wait);
1074         }
1075
1076         /*
1077          * Return failure for non-async IO requests.  Async IO requests
1078          * are not allowed to fail, so we have to wait until buffer heads
1079          * become available.  But we don't want tasks sleeping with
1080          * partially complete buffers, so all were released above.
1081          */
1082         if (!async)
1083                 return NULL;
1084
1085         /* We're _really_ low on memory. Now we just
1086          * wait for old buffer heads to become free due to
1087          * finishing IO.  Since this is an async request and
1088          * the reserve list is empty, we're sure there are
1089          * async buffer heads in use.
1090          */
1091         run_task_queue(&tq_disk);
1092
1093         /*
1094          * Set our state for sleeping, then check again for buffer heads.
1095          * This ensures we won't miss a wake_up from an interrupt.
1096          */
1097         add_wait_queue(&buffer_wait, &wait);
1098         current->state = TASK_UNINTERRUPTIBLE;
1099         if (!recover_reusable_buffer_heads())
1100                 schedule();
1101         remove_wait_queue(&buffer_wait, &wait);
1102         current->state = TASK_RUNNING;
1103         goto try_again;
1104 }
1105
1106 /* Run the hooks that have to be done when a page I/O has completed. */
1107 static inline void after_unlock_page (struct page * page)
1108 {
1109         if (test_and_clear_bit(PG_decr_after, &page->flags)) {
1110                 atomic_dec(&nr_async_pages);
1111 #ifdef DEBUG_SWAP
1112                 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1113                         (char *) page_address(page),
1114                         atomic_read(&nr_async_pages));
1115 #endif
1116         }
1117         if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
1118                 swap_after_unlock_page(page->offset);
1119         if (test_and_clear_bit(PG_free_after, &page->flags))
1120                 __free_page(page);
1121 }
1122
1123 /*
1124  * Free all temporary buffers belonging to a page.
1125  * This needs to be called with interrupts disabled.
1126  */
1127 static inline void free_async_buffers (struct buffer_head * bh)
1128 {
1129         struct buffer_head *tmp, *tail;
1130
1131         /*
1132          * Link all the buffers into the b_next_free list,
1133          * so we only have to do one xchg() operation ...
1134          */
1135         tail = bh;
1136         while ((tmp = tail->b_this_page) != bh) {
1137                 tail->b_next_free = tmp;
1138                 tail = tmp;
1139         };
1140
1141         /* Update the reuse list */
1142         tail->b_next_free = xchg(&reuse_list, NULL);
1143         reuse_list = bh;
1144
1145         /* Wake up any waiters ... */
1146         wake_up(&buffer_wait);
1147 }
1148
1149 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
1150 {
1151         unsigned long flags;
1152         struct buffer_head *tmp;
1153         struct page *page;
1154
1155         mark_buffer_uptodate(bh, uptodate);
1156         unlock_buffer(bh);
1157
1158         /* This is a temporary buffer used for page I/O. */
1159         page = mem_map + MAP_NR(bh->b_data);
1160         if (!PageLocked(page))
1161                 goto not_locked;
1162         if (bh->b_count != 1)
1163                 goto bad_count;
1164
1165         if (!test_bit(BH_Uptodate, &bh->b_state))
1166                 set_bit(PG_error, &page->flags);
1167
1168         /*
1169          * Be _very_ careful from here on. Bad things can happen if
1170          * two buffer heads end IO at almost the same time and both
1171          * decide that the page is now completely done.
1172          *
1173          * Async buffer_heads are here only as labels for IO, and get
1174          * thrown away once the IO for this page is complete.  IO is
1175          * deemed complete once all buffers have been visited
1176          * (b_count==0) and are now unlocked. We must make sure that
1177          * only the _last_ buffer that decrements its count is the one
1178          * that free's the page..
1179          */
1180         save_flags(flags);
1181         cli();
1182         bh->b_count--;
1183         tmp = bh;
1184         do {
1185                 if (tmp->b_count)
1186                         goto still_busy;
1187                 tmp = tmp->b_this_page;
1188         } while (tmp != bh);
1189
1190         /* OK, the async IO on this page is complete. */
1191         free_async_buffers(bh);
1192         restore_flags(flags);
1193         clear_bit(PG_locked, &page->flags);
1194         wake_up(&page->wait);
1195         after_unlock_page(page);
1196         return;
1197
1198 still_busy:
1199         restore_flags(flags);
1200         return;
1201
1202 not_locked:
1203         printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1204         return;
1205
1206 bad_count:
1207         printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1208         return;
1209 }
1210
1211 /*
1212  * Start I/O on a page.
1213  * This function expects the page to be locked and may return before I/O is complete.
1214  * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1215  */
1216 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1217 {
1218         struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
1219         int block, nr;
1220
1221         if (!PageLocked(page))
1222                 panic("brw_page: page not locked for I/O");
1223         clear_bit(PG_uptodate, &page->flags);
1224         clear_bit(PG_error, &page->flags);
1225         /*
1226          * Allocate async buffer heads pointing to this page, just for I/O.
1227          * They do _not_ show up in the buffer hash table!
1228          * They are _not_ registered in page->buffers either!
1229          */
1230         bh = create_buffers(page_address(page), size, 1);
1231         if (!bh) {
1232                 /* WSH: exit here leaves page->count incremented */
1233                 clear_bit(PG_locked, &page->flags);
1234                 wake_up(&page->wait);
1235                 return -ENOMEM;
1236         }
1237         nr = 0;
1238         next = bh;
1239         do {
1240                 struct buffer_head * tmp;
1241                 block = *(b++);
1242
1243                 init_buffer(next, dev, block, end_buffer_io_async, NULL);
1244                 set_bit(BH_Uptodate, &next->b_state);
1245
1246                 /*
1247                  * When we use bmap, we define block zero to represent
1248                  * a hole.  ll_rw_page, however, may legitimately
1249                  * access block zero, and we need to distinguish the
1250                  * two cases.
1251                  */
1252                 if (bmap && !block) {
1253                         memset(next->b_data, 0, size);
1254                         next->b_count--;
1255                         continue;
1256                 }
1257                 tmp = get_hash_table(dev, block, size);
1258                 if (tmp) {
1259                         if (!buffer_uptodate(tmp)) {
1260                                 if (rw == READ)
1261                                         ll_rw_block(READ, 1, &tmp);
1262                                 wait_on_buffer(tmp);
1263                         }
1264                         if (rw == READ)
1265                                 memcpy(next->b_data, tmp->b_data, size);
1266                         else {
1267                                 memcpy(tmp->b_data, next->b_data, size);
1268                                 mark_buffer_dirty(tmp, 0);
1269                         }
1270                         brelse(tmp);
1271                         next->b_count--;
1272                         continue;
1273                 }
1274                 if (rw == READ)
1275                         clear_bit(BH_Uptodate, &next->b_state);
1276                 else
1277                         set_bit(BH_Dirty, &next->b_state);
1278                 arr[nr++] = next;
1279         } while (prev = next, (next = next->b_this_page) != NULL);
1280         prev->b_this_page = bh;
1281
1282         if (nr) {
1283                 ll_rw_block(rw, nr, arr);
1284                 /* The rest of the work is done in mark_buffer_uptodate()
1285                  * and unlock_buffer(). */
1286         } else {
1287                 unsigned long flags;
1288                 clear_bit(PG_locked, &page->flags);
1289                 set_bit(PG_uptodate, &page->flags);
1290                 wake_up(&page->wait);
1291                 save_flags(flags);
1292                 cli();
1293                 free_async_buffers(bh);
1294                 restore_flags(flags);
1295                 after_unlock_page(page);
1296         }
1297         ++current->maj_flt;
1298         return 0;
1299 }
1300
1301 /*
1302  * This is called by end_request() when I/O has completed.
1303  */
1304 void mark_buffer_uptodate(struct buffer_head * bh, int on)
1305 {
1306         if (on) {
1307                 struct buffer_head *tmp = bh;
1308                 set_bit(BH_Uptodate, &bh->b_state);
1309                 /* If a page has buffers and all these buffers are uptodate,
1310                  * then the page is uptodate. */
1311                 do {
1312                         if (!test_bit(BH_Uptodate, &tmp->b_state))
1313                                 return;
1314                         tmp=tmp->b_this_page;
1315                 } while (tmp && tmp != bh);
1316                 set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
1317                 return;
1318         }
1319         clear_bit(BH_Uptodate, &bh->b_state);
1320 }
1321
1322 /*
1323  * Generic "readpage" function for block devices that have the normal
1324  * bmap functionality. This is most of the block device filesystems.
1325  * Reads the page asynchronously --- the unlock_buffer() and
1326  * mark_buffer_uptodate() functions propagate buffer state into the
1327  * page struct once IO has completed.
1328  */
1329 int generic_readpage(struct file * file, struct page * page)
1330 {
1331         struct dentry *dentry = file->f_dentry;
1332         struct inode *inode = dentry->d_inode;
1333         unsigned long block;
1334         int *p, nr[PAGE_SIZE/512];
1335         int i;
1336
1337         atomic_inc(&page->count);
1338         set_bit(PG_locked, &page->flags);
1339         set_bit(PG_free_after, &page->flags);
1340
1341         i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1342         block = page->offset >> inode->i_sb->s_blocksize_bits;
1343         p = nr;
1344         do {
1345                 *p = inode->i_op->bmap(inode, block);
1346                 i--;
1347                 block++;
1348                 p++;
1349         } while (i > 0);
1350
1351         /* IO start */
1352         brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
1353         return 0;
1354 }
1355
1356 /*
1357  * Try to increase the number of buffers available: the size argument
1358  * is used to determine what kind of buffers we want.
1359  */
1360 static int grow_buffers(int size)
1361 {
1362         unsigned long page;
1363         struct buffer_head *bh, *tmp;
1364         struct buffer_head * insert_point;
1365         int isize;
1366
1367         if ((size & 511) || (size > PAGE_SIZE)) {
1368                 printk("VFS: grow_buffers: size = %d\n",size);
1369                 return 0;
1370         }
1371
1372         if (!(page = __get_free_page(GFP_BUFFER)))
1373                 return 0;
1374         bh = create_buffers(page, size, 0);
1375         if (!bh) {
1376                 free_page(page);
1377                 return 0;
1378         }
1379
1380         isize = BUFSIZE_INDEX(size);
1381         insert_point = free_list[isize];
1382
1383         tmp = bh;
1384         while (1) {
1385                 if (insert_point) {
1386                         tmp->b_next_free = insert_point->b_next_free;
1387                         tmp->b_prev_free = insert_point;
1388                         insert_point->b_next_free->b_prev_free = tmp;
1389                         insert_point->b_next_free = tmp;
1390                 } else {
1391                         tmp->b_prev_free = tmp;
1392                         tmp->b_next_free = tmp;
1393                 }
1394                 insert_point = tmp;
1395                 ++nr_buffers;
1396                 if (tmp->b_this_page)
1397                         tmp = tmp->b_this_page;
1398                 else
1399                         break;
1400         }
1401         tmp->b_this_page = bh;
1402         free_list[isize] = bh;
1403         mem_map[MAP_NR(page)].buffers = bh;
1404         buffermem += PAGE_SIZE;
1405         return 1;
1406 }
1407
1408 /*
1409  * Can the buffer be thrown out?
1410  */
1411 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1412 #define buffer_busy(bh)         ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
1413
1414 /*
1415  * try_to_free_buffers() checks if all the buffers on this particular page
1416  * are unused, and free's the page if so.
1417  *
1418  * Wake up bdflush() if this fails - if we're running low on memory due
1419  * to dirty buffers, we need to flush them out as quickly as possible.
1420  */
1421 int try_to_free_buffers(struct page * page_map)
1422 {
1423         struct buffer_head * tmp, * bh = page_map->buffers;
1424
1425         tmp = bh;
1426         do {
1427                 struct buffer_head * p = tmp;
1428
1429                 tmp = tmp->b_this_page;
1430                 if (!buffer_busy(p))
1431                         continue;
1432
1433                 wakeup_bdflush(0);
1434                 return 0;
1435         } while (tmp != bh);
1436
1437         tmp = bh;
1438         do {
1439                 struct buffer_head * p = tmp;
1440                 tmp = tmp->b_this_page;
1441                 nr_buffers--;
1442                 remove_from_queues(p);
1443                 put_unused_buffer_head(p);
1444         } while (tmp != bh);
1445
1446         /* Wake up anyone waiting for buffer heads */
1447         wake_up(&buffer_wait);
1448
1449         /* And free the page */
1450         buffermem -= PAGE_SIZE;
1451         page_map->buffers = NULL;
1452         __free_page(page_map);
1453         return 1;
1454 }
1455
1456 /* ================== Debugging =================== */
1457
1458 void show_buffers(void)
1459 {
1460         struct buffer_head * bh;
1461         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
1462         int protected = 0;
1463         int nlist;
1464         static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
1465
1466         printk("Buffer memory:   %6dkB\n",buffermem>>10);
1467         printk("Buffer heads:    %6d\n",nr_buffer_heads);
1468         printk("Buffer blocks:   %6d\n",nr_buffers);
1469         printk("Buffer hashed:   %6d\n",nr_hashed_buffers);
1470
1471         for(nlist = 0; nlist < NR_LIST; nlist++) {
1472           found = locked = dirty = used = lastused = protected = 0;
1473           bh = lru_list[nlist];
1474           if(!bh) continue;
1475
1476           do {
1477                 found++;
1478                 if (buffer_locked(bh))
1479                         locked++;
1480                 if (buffer_protected(bh))
1481                         protected++;
1482                 if (buffer_dirty(bh))
1483                         dirty++;
1484                 if (bh->b_count)
1485                         used++, lastused = found;
1486                 bh = bh->b_next_free;
1487           } while (bh != lru_list[nlist]);
1488           printk("%8s: %d buffers, %d used (last=%d), "
1489                  "%d locked, %d protected, %d dirty\n",
1490                  buf_types[nlist], found, used, lastused,
1491                  locked, protected, dirty);
1492         };
1493 }
1494
1495
1496 /* ===================== Init ======================= */
1497
1498 /*
1499  * allocate the hash table and init the free list
1500  * Use gfp() for the hash table to decrease TLB misses, use
1501  * SLAB cache for buffer heads.
1502  */
1503 void __init buffer_init(unsigned long memory_size)
1504 {
1505         int order;
1506         unsigned int nr_hash;
1507
1508         /* we need to guess at the right sort of size for a buffer cache.
1509            the heuristic from working with large databases and getting
1510            fsync times (ext2) manageable, is the following */
1511
1512         memory_size >>= 20;
1513         for (order = 5; (1UL << order) < memory_size; order++);
1514
1515         /* try to allocate something until we get it or we're asking
1516            for something that is really too small */
1517
1518         do {
1519                 nr_hash = (1UL << order) * PAGE_SIZE /
1520                     sizeof(struct buffer_head *);
1521                 hash_table = (struct buffer_head **)
1522                     __get_free_pages(GFP_ATOMIC, order);
1523         } while (hash_table == NULL && --order > 4);
1524
1525         if (!hash_table)
1526                 panic("Failed to allocate buffer hash table\n");
1527         memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
1528         bh_hash_mask = nr_hash-1;
1529
1530         bh_cachep = kmem_cache_create("buffer_head",
1531                                       sizeof(struct buffer_head),
1532                                       0,
1533                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
1534         if(!bh_cachep)
1535                 panic("Cannot create buffer head SLAB cache\n");
1536         /*
1537          * Allocate the reserved buffer heads.
1538          */
1539         while (nr_buffer_heads < NR_RESERVED) {
1540                 struct buffer_head * bh;
1541
1542                 bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
1543                 if (!bh)
1544                         break;
1545                 put_unused_buffer_head(bh);
1546                 nr_buffer_heads++;
1547         }
1548
1549         lru_list[BUF_CLEAN] = 0;
1550         grow_buffers(BLOCK_SIZE);
1551 }
1552
1553
1554 /* ====================== bdflush support =================== */
1555
1556 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1557  * response to dirty buffers.  Once this process is activated, we write back
1558  * a limited number of buffers to the disks and then go back to sleep again.
1559  */
1560 static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
1561 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
1562 struct task_struct *bdflush_tsk = 0;
1563
1564 void wakeup_bdflush(int wait)
1565 {
1566         if (current == bdflush_tsk)
1567                 return;
1568         wake_up(&bdflush_wait);
1569         if (wait) {
1570                 run_task_queue(&tq_disk);
1571                 sleep_on(&bdflush_done);
1572         }
1573 }
1574
1575
1576 /*
1577  * Here we attempt to write back old buffers.  We also try to flush inodes
1578  * and supers as well, since this function is essentially "update", and
1579  * otherwise there would be no way of ensuring that these quantities ever
1580  * get written back.  Ideally, we would have a timestamp on the inodes
1581  * and superblocks so that we could write back only the old ones as well
1582  */
1583
1584 static int sync_old_buffers(void)
1585 {
1586         int i;
1587         int ndirty, nwritten;
1588         int nlist;
1589         int ncount;
1590         struct buffer_head * bh, *next;
1591
1592         sync_supers(0);
1593         sync_inodes(0);
1594
1595         ncount = 0;
1596 #ifdef DEBUG
1597         for(nlist = 0; nlist < NR_LIST; nlist++)
1598 #else
1599         for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
1600 #endif
1601         {
1602                 ndirty = 0;
1603                 nwritten = 0;
1604         repeat:
1605
1606                 bh = lru_list[nlist];
1607                 if(bh)
1608                          for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1609                                  /* We may have stalled while waiting for I/O to complete. */
1610                                  if(bh->b_list != nlist) goto repeat;
1611                                  next = bh->b_next_free;
1612                                  if(!lru_list[nlist]) {
1613                                          printk("Dirty list empty %d\n", i);
1614                                          break;
1615                                  }
1616
1617                                  /* Clean buffer on dirty list?  Refile it */
1618                                  if (nlist == BUF_DIRTY && !buffer_dirty(bh) && !buffer_locked(bh)) {
1619                                          refile_buffer(bh);
1620                                          continue;
1621                                  }
1622
1623                                   /* Unlocked buffer on locked list?  Refile it */
1624                                   if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
1625                                           refile_buffer(bh);
1626                                           continue;
1627                                   }
1628
1629                                  if (buffer_locked(bh) || !buffer_dirty(bh))
1630                                           continue;
1631                                  ndirty++;
1632                                  if(time_before(jiffies, bh->b_flushtime))
1633                                         continue;
1634                                  nwritten++;
1635                                  next->b_count++;
1636                                  bh->b_count++;
1637                                  bh->b_flushtime = 0;
1638 #ifdef DEBUG
1639                                  if(nlist != BUF_DIRTY) ncount++;
1640 #endif
1641                                  ll_rw_block(WRITE, 1, &bh);
1642                                  bh->b_count--;
1643                                  next->b_count--;
1644                          }
1645         }
1646         run_task_queue(&tq_disk);
1647 #ifdef DEBUG
1648         if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
1649         printk("Wrote %d/%d buffers\n", nwritten, ndirty);
1650 #endif
1651         run_task_queue(&tq_disk);
1652         return 0;
1653 }
1654
1655
1656 /* This is the interface to bdflush.  As we get more sophisticated, we can
1657  * pass tuning parameters to this "process", to adjust how it behaves.
1658  * We would want to verify each parameter, however, to make sure that it
1659  * is reasonable. */
1660
1661 asmlinkage int sys_bdflush(int func, long data)
1662 {
1663         int i, error = -EPERM;
1664
1665         lock_kernel();
1666         if (!capable(CAP_SYS_ADMIN))
1667                 goto out;
1668
1669         if (func == 1) {
1670                  error = sync_old_buffers();
1671                  goto out;
1672         }
1673
1674         /* Basically func 1 means read param 1, 2 means write param 1, etc */
1675         if (func >= 2) {
1676                 i = (func-2) >> 1;
1677                 error = -EINVAL;
1678                 if (i < 0 || i >= N_PARAM)
1679                         goto out;
1680                 if((func & 1) == 0) {
1681                         error = put_user(bdf_prm.data[i], (int*)data);
1682                         goto out;
1683                 }
1684                 if (data < bdflush_min[i] || data > bdflush_max[i])
1685                         goto out;
1686                 bdf_prm.data[i] = data;
1687                 error = 0;
1688                 goto out;
1689         };
1690
1691         /* Having func 0 used to launch the actual bdflush and then never
1692          * return (unless explicitly killed). We return zero here to
1693          * remain semi-compatible with present update(8) programs.
1694          */
1695         error = 0;
1696 out:
1697         unlock_kernel();
1698         return error;
1699 }
1700
1701 /* This is the actual bdflush daemon itself. It used to be started from
1702  * the syscall above, but now we launch it ourselves internally with
1703  * kernel_thread(...)  directly after the first thread in init/main.c */
1704
1705 /* To prevent deadlocks for a loop device:
1706  * 1) Do non-blocking writes to loop (avoids deadlock with running
1707  *      out of request blocks).
1708  * 2) But do a blocking write if the only dirty buffers are loop buffers
1709  *      (otherwise we go into an infinite busy-loop).
1710  * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
1711  *      with running out of free buffers for loop's "real" device).
1712 */
1713 int bdflush(void * unused)
1714 {
1715         int i;
1716         int ndirty;
1717         int nlist;
1718         int ncount;
1719         struct buffer_head * bh, *next;
1720         int major;
1721         int wrta_cmd = WRITEA;  /* non-blocking write for LOOP */
1722
1723         /*
1724          *      We have a bare-bones task_struct, and really should fill
1725          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
1726          *      display semi-sane things. Not real crucial though...
1727          */
1728
1729         current->session = 1;
1730         current->pgrp = 1;
1731         sprintf(current->comm, "kflushd");
1732         bdflush_tsk = current;
1733
1734         /*
1735          *      As a kernel thread we want to tamper with system buffers
1736          *      and other internals and thus be subject to the SMP locking
1737          *      rules. (On a uniprocessor box this does nothing).
1738          */
1739         lock_kernel();
1740
1741         for (;;) {
1742 #ifdef DEBUG
1743                 printk("bdflush() activated...");
1744 #endif
1745
1746                 CHECK_EMERGENCY_SYNC
1747
1748                 ncount = 0;
1749 #ifdef DEBUG
1750                 for(nlist = 0; nlist < NR_LIST; nlist++)
1751 #else
1752                 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++)
1753 #endif
1754                  {
1755                          ndirty = 0;
1756                  repeat:
1757
1758                          bh = lru_list[nlist];
1759                          if(bh)
1760                                   for (i = nr_buffers_type[nlist]; i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
1761                                        bh = next) {
1762                                           /* We may have stalled while waiting for I/O to complete. */
1763                                           if(bh->b_list != nlist) goto repeat;
1764                                           next = bh->b_next_free;
1765                                           if(!lru_list[nlist]) {
1766                                                   printk("Dirty list empty %d\n", i);
1767                                                   break;
1768                                           }
1769
1770                                           /* Clean buffer on dirty list?  Refile it */
1771                                           if (nlist == BUF_DIRTY && !buffer_dirty(bh)) {
1772                                                   refile_buffer(bh);
1773                                                   continue;
1774                                           }
1775
1776                                           /* Unlocked buffer on locked list?  Refile it */
1777                                           if (nlist == BUF_LOCKED && !buffer_locked(bh)) {
1778                                                   refile_buffer(bh);
1779                                                   continue;
1780                                           }
1781
1782                                           if (buffer_locked(bh) || !buffer_dirty(bh))
1783                                                    continue;
1784                                           major = MAJOR(bh->b_dev);
1785                                           /* Should we write back buffers that are shared or not??
1786                                              currently dirty buffers are not shared, so it does not matter */
1787                                           next->b_count++;
1788                                           bh->b_count++;
1789                                           ndirty++;
1790                                           bh->b_flushtime = 0;
1791                                           if (major == LOOP_MAJOR) {
1792                                                   ll_rw_block(wrta_cmd,1, &bh);
1793                                                   wrta_cmd = WRITEA;
1794                                                   if (buffer_dirty(bh))
1795                                                           --ndirty;
1796                                           }
1797                                           else
1798                                           ll_rw_block(WRITE, 1, &bh);
1799 #ifdef DEBUG
1800                                           if(nlist != BUF_DIRTY) ncount++;
1801 #endif
1802                                           bh->b_count--;
1803                                           next->b_count--;
1804                                   }
1805                  }
1806 #ifdef DEBUG
1807                 if (ncount) printk("sys_bdflush: %d dirty buffers not on dirty list\n", ncount);
1808                 printk("sleeping again.\n");
1809 #endif
1810                 /* If we didn't write anything, but there are still
1811                  * dirty buffers, then make the next write to a
1812                  * loop device to be a blocking write.
1813                  * This lets us block--which we _must_ do! */
1814                 if (ndirty == 0 && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
1815                         wrta_cmd = WRITE;
1816                         continue;
1817                 }
1818                 run_task_queue(&tq_disk);
1819                 wake_up(&bdflush_done);
1820
1821                 /* If there are still a lot of dirty buffers around, skip the sleep
1822                    and flush some more */
1823                 if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
1824                         spin_lock_irq(&current->sigmask_lock);
1825                         flush_signals(current);
1826                         spin_unlock_irq(&current->sigmask_lock);
1827
1828                         interruptible_sleep_on(&bdflush_wait);
1829                 }
1830         }
1831 }