fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9  * been avoided by NEVER letting an interrupt change a buffer (except for the
  10  * data, of course), but instead letting the caller do it.
  11  */
  12
  13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15 /* Removed a lot of unnecessary code and simplified things now that
  16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17  */
  18
  19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20  * hash table, use SLAB cache for buffer heads. -DaveM
  21  */
  22
  23 /* Added 32k buffer block sizes - these are required older ARM systems.
  24  * - RMK
  25  */
  26
  27 #include <linux/malloc.h>
  28 #include <linux/locks.h>
  29 #include <linux/errno.h>
  30 #include <linux/swap.h>
  31 #include <linux/swapctl.h>
  32 #include <linux/smp_lock.h>
  33 #include <linux/vmalloc.h>
  34 #include <linux/blkdev.h>
  35 #include <linux/sysrq.h>
  36 #include <linux/file.h>
  37 #include <linux/init.h>
  38 #include <linux/quotaops.h>
  39
  40 #include <asm/uaccess.h>
  41 #include <asm/io.h>
  42 #include <asm/bitops.h>
  43
  44 #define NR_SIZES 7
  45 static char buffersize_index[65] =
  46 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  47   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  48   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  49  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  50   6};
  51
  52 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
  53 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
  54 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
  55 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
  56                                              number of unused buffer heads */
  57
  58 /*
  59  * Hash table mask..
  60  */
  61 static unsigned long bh_hash_mask = 0;
  62
  63 static int grow_buffers(int size);
  64
  65 static struct buffer_head ** hash_table;
  66 static struct buffer_head * lru_list[NR_LIST] = {NULL, };
  67 static struct buffer_head * free_list[NR_SIZES] = {NULL, };
  68
  69 static kmem_cache_t *bh_cachep;
  70
  71 static struct buffer_head * unused_list = NULL;
  72 static struct buffer_head * reuse_list = NULL;
  73 static struct wait_queue * buffer_wait = NULL;
  74
  75 static int nr_buffers = 0;
  76 static int nr_buffers_type[NR_LIST] = {0,};
  77 static int nr_buffer_heads = 0;
  78 static int nr_unused_buffer_heads = 0;
  79 static int nr_hashed_buffers = 0;
  80
  81 /* This is used by some architectures to estimate available memory. */
  82 int buffermem = 0;
  83
  84 /* Here is the parameter block for the bdflush process. If you add or
  85  * remove any of the parameters, make sure to update kernel/sysctl.c.
  86  */
  87
  88 #define N_PARAM 9
  89
  90 /* The dummy values in this structure are left in there for compatibility
  91  * with old programs that play with the /proc entries.
  92  */
  93 union bdflush_param{
  94         struct {
  95                 int nfract;  /* Percentage of buffer cache dirty to
  96                                 activate bdflush */
  97                 int ndirty;  /* Maximum number of dirty blocks to write out per
  98                                 wake-cycle */
  99                 int nrefill; /* Number of clean buffers to try to obtain
 100                                 each time we call refill */
 101                 int nref_dirt; /* Dirty buffer threshold for activating bdflush
 102                                   when trying to refill buffers. */
 103                 int interval;  /* Interval (seconds) between spontaneous
 104                                   bdflush runs */
 105                 int age_buffer;  /* Time for normal buffer to age before
 106                                     we flush it */
 107                 int age_super;  /* Time for superblock to age before we
 108                                    flush it */
 109                 int dummy2;    /* unused */
 110                 int dummy3;    /* unused */
 111         } b_un;
 112         unsigned int data[N_PARAM];
 113 } bdf_prm = {{40, 500, 64, 256, 5, 30*HZ, 5*HZ, 1884, 2}};
 114
 115 /* These are the min and max parameter values that we will allow to be assigned */
 116 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  1,   1*HZ,   1*HZ, 1, 1};
 117 int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 600*HZ, 600*HZ, 2047, 5};
 118
 119 void wakeup_bdflush(int);
 120
 121 /*
 122  * Rewrote the wait-routines to use the "new" wait-queue functionality,
 123  * and getting rid of the cli-sti pairs. The wait-queue routines still
 124  * need cli-sti, but now it's just a couple of 386 instructions or so.
 125  *
 126  * Note that the real wait_on_buffer() is an inline function that checks
 127  * if 'b_wait' is set before calling this, so that the queues aren't set
 128  * up unnecessarily.
 129  */
 130 void __wait_on_buffer(struct buffer_head * bh)
 131 {
 132         struct task_struct *tsk = current;
 133         struct wait_queue wait;
 134
 135         bh->b_count++;
 136         wait.task = tsk;
 137         add_wait_queue(&bh->b_wait, &wait);
 138 repeat:
 139         tsk->state = TASK_UNINTERRUPTIBLE;
 140         run_task_queue(&tq_disk);
 141         if (buffer_locked(bh)) {
 142                 schedule();
 143                 goto repeat;
 144         }
 145         tsk->state = TASK_RUNNING;
 146         remove_wait_queue(&bh->b_wait, &wait);
 147         bh->b_count--;
 148 }
 149
 150 /* Call sync_buffers with wait!=0 to ensure that the call does not
 151  * return until all buffer writes have completed.  Sync() may return
 152  * before the writes have finished; fsync() may not.
 153  */
 154
 155 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
 156  * spontaneously dirty themselves without ever brelse being called.
 157  * We will ultimately want to put these in a separate list, but for
 158  * now we search all of the lists for dirty buffers.
 159  */
 160 static int sync_buffers(kdev_t dev, int wait)
 161 {
 162         int i, retry, pass = 0, err = 0;
 163         struct buffer_head * bh, *next;
 164
 165         /* One pass for no-wait, three for wait:
 166          * 0) write out all dirty, unlocked buffers;
 167          * 1) write out all dirty buffers, waiting if locked;
 168          * 2) wait for completion by waiting for all buffers to unlock.
 169          */
 170         do {
 171                 retry = 0;
 172 repeat:
 173                 /* We search all lists as a failsafe mechanism, not because we expect
 174                  * there to be dirty buffers on any of the other lists.
 175                  */
 176                 bh = lru_list[BUF_DIRTY];
 177                 if (!bh)
 178                         goto repeat2;
 179                 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
 180                         if (bh->b_list != BUF_DIRTY)
 181                                 goto repeat;
 182                         next = bh->b_next_free;
 183                         if (!lru_list[BUF_DIRTY])
 184                                 break;
 185                         if (dev && bh->b_dev != dev)
 186                                 continue;
 187                         if (buffer_locked(bh)) {
 188                                 /* Buffer is locked; skip it unless wait is
 189                                  * requested AND pass > 0.
 190                                  */
 191                                 if (!wait || !pass) {
 192                                         retry = 1;
 193                                         continue;
 194                                 }
 195                                 wait_on_buffer (bh);
 196                                 goto repeat;
 197                         }
 198
 199                         /* If an unlocked buffer is not uptodate, there has
 200                          * been an IO error. Skip it.
 201                          */
 202                         if (wait && buffer_req(bh) && !buffer_locked(bh) &&
 203                             !buffer_dirty(bh) && !buffer_uptodate(bh)) {
 204                                 err = -EIO;
 205                                 continue;
 206                         }
 207
 208                         /* Don't write clean buffers.  Don't write ANY buffers
 209                          * on the third pass.
 210                          */
 211                         if (!buffer_dirty(bh) || pass >= 2)
 212                                 continue;
 213
 214                         /* Don't bother about locked buffers.
 215                          *
 216                          * XXX We checked if it was locked above and there is no
 217                          * XXX way we could have slept in between. -DaveM
 218                          */
 219                         if (buffer_locked(bh))
 220                                 continue;
 221                         bh->b_count++;
 222                         next->b_count++;
 223                         bh->b_flushtime = 0;
 224                         ll_rw_block(WRITE, 1, &bh);
 225                         bh->b_count--;
 226                         next->b_count--;
 227                         retry = 1;
 228                 }
 229
 230     repeat2:
 231                 bh = lru_list[BUF_LOCKED];
 232                 if (!bh)
 233                         break;
 234                 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
 235                         if (bh->b_list != BUF_LOCKED)
 236                                 goto repeat2;
 237                         next = bh->b_next_free;
 238                         if (!lru_list[BUF_LOCKED])
 239                                 break;
 240                         if (dev && bh->b_dev != dev)
 241                                 continue;
 242                         if (buffer_locked(bh)) {
 243                                 /* Buffer is locked; skip it unless wait is
 244                                  * requested AND pass > 0.
 245                                  */
 246                                 if (!wait || !pass) {
 247                                         retry = 1;
 248                                         continue;
 249                                 }
 250                                 wait_on_buffer (bh);
 251                                 goto repeat2;
 252                         }
 253                 }
 254
 255                 /* If we are waiting for the sync to succeed, and if any dirty
 256                  * blocks were written, then repeat; on the second pass, only
 257                  * wait for buffers being written (do not pass to write any
 258                  * more buffers on the second pass).
 259                  */
 260         } while (wait && retry && ++pass<=2);
 261         return err;
 262 }
 263
 264 void sync_dev(kdev_t dev)
 265 {
 266         sync_buffers(dev, 0);
 267         sync_supers(dev);
 268         sync_inodes(dev);
 269         sync_buffers(dev, 0);
 270         DQUOT_SYNC(dev);
 271         /*
 272          * FIXME(eric) we need to sync the physical devices here.
 273          * This is because some (scsi) controllers have huge amounts of
 274          * cache onboard (hundreds of Mb), and we need to instruct
 275          * them to commit all of the dirty memory to disk, and we should
 276          * not return until this has happened.
 277          *
 278          * This would need to get implemented by going through the assorted
 279          * layers so that each block major number can be synced, and this
 280          * would call down into the upper and mid-layer scsi.
 281          */
 282 }
 283
 284 int fsync_dev(kdev_t dev)
 285 {
 286         sync_buffers(dev, 0);
 287         sync_supers(dev);
 288         sync_inodes(dev);
 289         DQUOT_SYNC(dev);
 290         return sync_buffers(dev, 1);
 291 }
 292
 293 asmlinkage int sys_sync(void)
 294 {
 295         lock_kernel();
 296         fsync_dev(0);
 297         unlock_kernel();
 298         return 0;
 299 }
 300
 301 /*
 302  *      filp may be NULL if called via the msync of a vma.
 303  */
 304
 305 int file_fsync(struct file *filp, struct dentry *dentry)
 306 {
 307         struct inode * inode = dentry->d_inode;
 308         struct super_block * sb;
 309         kdev_t dev;
 310
 311         /* sync the inode to buffers */
 312         write_inode_now(inode);
 313
 314         /* sync the superblock to buffers */
 315         sb = inode->i_sb;
 316         wait_on_super(sb);
 317         if (sb->s_op && sb->s_op->write_super)
 318                 sb->s_op->write_super(sb);
 319
 320         /* .. finally sync the buffers to disk */
 321         dev = inode->i_dev;
 322         return sync_buffers(dev, 1);
 323 }
 324
 325 asmlinkage int sys_fsync(unsigned int fd)
 326 {
 327         struct file * file;
 328         struct dentry * dentry;
 329         struct inode * inode;
 330         int err;
 331
 332         lock_kernel();
 333         err = -EBADF;
 334         file = fget(fd);
 335         if (!file)
 336                 goto out;
 337
 338         dentry = file->f_dentry;
 339         if (!dentry)
 340                 goto out_putf;
 341
 342         inode = dentry->d_inode;
 343         if (!inode)
 344                 goto out_putf;
 345
 346         err = -EINVAL;
 347         if (!file->f_op || !file->f_op->fsync)
 348                 goto out_putf;
 349
 350         /* We need to protect against concurrent writers.. */
 351         down(&inode->i_sem);
 352         err = file->f_op->fsync(file, dentry);
 353         up(&inode->i_sem);
 354
 355 out_putf:
 356         fput(file);
 357 out:
 358         unlock_kernel();
 359         return err;
 360 }
 361
 362 asmlinkage int sys_fdatasync(unsigned int fd)
 363 {
 364         struct file * file;
 365         struct dentry * dentry;
 366         struct inode * inode;
 367         int err;
 368
 369         lock_kernel();
 370         err = -EBADF;
 371         file = fget(fd);
 372         if (!file)
 373                 goto out;
 374
 375         dentry = file->f_dentry;
 376         if (!dentry)
 377                 goto out_putf;
 378
 379         inode = dentry->d_inode;
 380         if (!inode)
 381                 goto out_putf;
 382
 383         err = -EINVAL;
 384         if (!file->f_op || !file->f_op->fsync)
 385                 goto out_putf;
 386
 387         /* this needs further work, at the moment it is identical to fsync() */
 388         down(&inode->i_sem);
 389         err = file->f_op->fsync(file, dentry);
 390         up(&inode->i_sem);
 391
 392 out_putf:
 393         fput(file);
 394 out:
 395         unlock_kernel();
 396         return err;
 397 }
 398
 399 void invalidate_buffers(kdev_t dev)
 400 {
 401         int i;
 402         int nlist;
 403         struct buffer_head * bh;
 404
 405         for(nlist = 0; nlist < NR_LIST; nlist++) {
 406                 bh = lru_list[nlist];
 407                 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
 408                         if (bh->b_dev != dev)
 409                                 continue;
 410                         wait_on_buffer(bh);
 411                         if (bh->b_dev != dev)
 412                                 continue;
 413                         if (bh->b_count)
 414                                 continue;
 415                         bh->b_flushtime = 0;
 416                         clear_bit(BH_Protected, &bh->b_state);
 417                         clear_bit(BH_Uptodate, &bh->b_state);
 418                         clear_bit(BH_Dirty, &bh->b_state);
 419                         clear_bit(BH_Req, &bh->b_state);
 420                 }
 421         }
 422 }
 423
 424 #define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
 425 #define hash(dev,block) hash_table[_hashfn(dev,block)]
 426
 427 static inline void remove_from_hash_queue(struct buffer_head * bh)
 428 {
 429         struct buffer_head **pprev = bh->b_pprev;
 430         if (pprev) {
 431                 struct buffer_head * next = bh->b_next;
 432                 if (next) {
 433                         next->b_pprev = pprev;
 434                         bh->b_next = NULL;
 435                 }
 436                 *pprev = next;
 437                 bh->b_pprev = NULL;
 438         }
 439         nr_hashed_buffers--;
 440 }
 441
 442 static inline void remove_from_lru_list(struct buffer_head * bh)
 443 {
 444         if (!(bh->b_prev_free) || !(bh->b_next_free))
 445                 panic("VFS: LRU block list corrupted");
 446         if (bh->b_dev == B_FREE)
 447                 panic("LRU list corrupted");
 448         bh->b_prev_free->b_next_free = bh->b_next_free;
 449         bh->b_next_free->b_prev_free = bh->b_prev_free;
 450
 451         if (lru_list[bh->b_list] == bh)
 452                  lru_list[bh->b_list] = bh->b_next_free;
 453         if (lru_list[bh->b_list] == bh)
 454                  lru_list[bh->b_list] = NULL;
 455         bh->b_next_free = bh->b_prev_free = NULL;
 456 }
 457
 458 static inline void remove_from_free_list(struct buffer_head * bh)
 459 {
 460         int isize = BUFSIZE_INDEX(bh->b_size);
 461         if (!(bh->b_prev_free) || !(bh->b_next_free))
 462                 panic("VFS: Free block list corrupted");
 463         if(bh->b_dev != B_FREE)
 464                 panic("Free list corrupted");
 465         if(!free_list[isize])
 466                 panic("Free list empty");
 467         if(bh->b_next_free == bh)
 468                  free_list[isize] = NULL;
 469         else {
 470                 bh->b_prev_free->b_next_free = bh->b_next_free;
 471                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 472                 if (free_list[isize] == bh)
 473                          free_list[isize] = bh->b_next_free;
 474         }
 475         bh->b_next_free = bh->b_prev_free = NULL;
 476 }
 477
 478 static void remove_from_queues(struct buffer_head * bh)
 479 {
 480         if(bh->b_dev == B_FREE) {
 481                 remove_from_free_list(bh); /* Free list entries should not be
 482                                               in the hash queue */
 483                 return;
 484         }
 485         nr_buffers_type[bh->b_list]--;
 486         remove_from_hash_queue(bh);
 487         remove_from_lru_list(bh);
 488 }
 489
 490 static inline void put_last_free(struct buffer_head * bh)
 491 {
 492         if (bh) {
 493                 struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
 494
 495                 bh->b_dev = B_FREE;  /* So it is obvious we are on the free list. */
 496
 497                 /* Add to back of free list. */
 498                 if(!*bhp) {
 499                         *bhp = bh;
 500                         bh->b_prev_free = bh;
 501                 }
 502
 503                 bh->b_next_free = *bhp;
 504                 bh->b_prev_free = (*bhp)->b_prev_free;
 505                 (*bhp)->b_prev_free->b_next_free = bh;
 506                 (*bhp)->b_prev_free = bh;
 507         }
 508 }
 509
 510 static void insert_into_queues(struct buffer_head * bh)
 511 {
 512         /* put at end of free list */
 513         if(bh->b_dev == B_FREE) {
 514                 put_last_free(bh);
 515         } else {
 516                 struct buffer_head **bhp = &lru_list[bh->b_list];
 517
 518                 if(!*bhp) {
 519                         *bhp = bh;
 520                         bh->b_prev_free = bh;
 521                 }
 522
 523                 if (bh->b_next_free)
 524                         panic("VFS: buffer LRU pointers corrupted");
 525
 526                 bh->b_next_free = *bhp;
 527                 bh->b_prev_free = (*bhp)->b_prev_free;
 528                 (*bhp)->b_prev_free->b_next_free = bh;
 529                 (*bhp)->b_prev_free = bh;
 530
 531                 nr_buffers_type[bh->b_list]++;
 532
 533                 /* Put the buffer in new hash-queue if it has a device. */
 534                 bh->b_next = NULL;
 535                 bh->b_pprev = NULL;
 536                 if (bh->b_dev) {
 537                         struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
 538                         struct buffer_head *next = *bhp;
 539
 540                         if (next) {
 541                                 bh->b_next = next;
 542                                 next->b_pprev = &bh->b_next;
 543                         }
 544                         *bhp = bh;
 545                         bh->b_pprev = bhp;
 546                 }
 547                 nr_hashed_buffers++;
 548         }
 549 }
 550
 551 struct buffer_head * find_buffer(kdev_t dev, int block, int size)
 552 {
 553         struct buffer_head * next;
 554
 555         next = hash(dev,block);
 556         for (;;) {
 557                 struct buffer_head *tmp = next;
 558                 if (!next)
 559                         break;
 560                 next = tmp->b_next;
 561                 if (tmp->b_blocknr != block || tmp->b_size != size || tmp->b_dev != dev)
 562                         continue;
 563                 next = tmp;
 564                 break;
 565         }
 566         return next;
 567 }
 568
 569 /*
 570  * Why like this, I hear you say... The reason is race-conditions.
 571  * As we don't lock buffers (unless we are reading them, that is),
 572  * something might happen to it while we sleep (ie a read-error
 573  * will force it bad). This shouldn't really happen currently, but
 574  * the code is ready.
 575  */
 576 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 577 {
 578         struct buffer_head * bh;
 579         bh = find_buffer(dev,block,size);
 580         if (bh)
 581                 bh->b_count++;
 582         return bh;
 583 }
 584
 585 unsigned int get_hardblocksize(kdev_t dev)
 586 {
 587         /*
 588          * Get the hard sector size for the given device.  If we don't know
 589          * what it is, return 0.
 590          */
 591         if (hardsect_size[MAJOR(dev)] != NULL) {
 592                 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
 593                 if (blksize != 0)
 594                         return blksize;
 595         }
 596
 597         /*
 598          * We don't know what the hardware sector size for this device is.
 599          * Return 0 indicating that we don't know.
 600          */
 601         return 0;
 602 }
 603
 604 void set_blocksize(kdev_t dev, int size)
 605 {
 606         extern int *blksize_size[];
 607         int i, nlist;
 608         struct buffer_head * bh, *bhnext;
 609
 610         if (!blksize_size[MAJOR(dev)])
 611                 return;
 612
 613         /* Size must be a power of two, and between 512 and PAGE_SIZE */
 614         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
 615                 panic("Invalid blocksize passed to set_blocksize");
 616
 617         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
 618                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 619                 return;
 620         }
 621         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
 622                 return;
 623         sync_buffers(dev, 2);
 624         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 625
 626         /* We need to be quite careful how we do this - we are moving entries
 627          * around on the free list, and we can get in a loop if we are not careful.
 628          */
 629         for(nlist = 0; nlist < NR_LIST; nlist++) {
 630                 bh = lru_list[nlist];
 631                 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
 632                         if(!bh)
 633                                 break;
 634
 635                         bhnext = bh->b_next_free;
 636                         if (bh->b_dev != dev)
 637                                  continue;
 638                         if (bh->b_size == size)
 639                                  continue;
 640                         bhnext->b_count++;
 641                         wait_on_buffer(bh);
 642                         bhnext->b_count--;
 643                         if (bh->b_dev == dev && bh->b_size != size) {
 644                                 clear_bit(BH_Dirty, &bh->b_state);
 645                                 clear_bit(BH_Uptodate, &bh->b_state);
 646                                 clear_bit(BH_Req, &bh->b_state);
 647                                 bh->b_flushtime = 0;
 648                         }
 649                         remove_from_hash_queue(bh);
 650                 }
 651         }
 652 }
 653
 654 /*
 655  * We used to try various strange things. Let's not.
 656  */
 657 static void refill_freelist(int size)
 658 {
 659         if (!grow_buffers(size)) {
 660                 wakeup_bdflush(1);
 661                 current->policy |= SCHED_YIELD;
 662                 schedule();
 663         }
 664 }
 665
 666 void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
 667                  bh_end_io_t *handler, void *dev_id)
 668 {
 669         bh->b_count = 1;
 670         bh->b_list = BUF_CLEAN;
 671         bh->b_flushtime = 0;
 672         bh->b_dev = dev;
 673         bh->b_blocknr = block;
 674         bh->b_end_io = handler;
 675         bh->b_dev_id = dev_id;
 676 }
 677
 678 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 679 {
 680         mark_buffer_uptodate(bh, uptodate);
 681         unlock_buffer(bh);
 682 }
 683
 684 /*
 685  * Ok, this is getblk, and it isn't very clear, again to hinder
 686  * race-conditions. Most of the code is seldom used, (ie repeating),
 687  * so it should be much more efficient than it looks.
 688  *
 689  * The algorithm is changed: hopefully better, and an elusive bug removed.
 690  *
 691  * 14.02.92: changed it to sync dirty buffers a bit: better performance
 692  * when the filesystem starts to get full of dirty blocks (I hope).
 693  */
 694 struct buffer_head * getblk(kdev_t dev, int block, int size)
 695 {
 696         struct buffer_head * bh;
 697         int isize;
 698
 699 repeat:
 700         bh = get_hash_table(dev, block, size);
 701         if (bh) {
 702                 if (!buffer_dirty(bh)) {
 703                         bh->b_flushtime = 0;
 704                 }
 705                 return bh;
 706         }
 707
 708         isize = BUFSIZE_INDEX(size);
 709 get_free:
 710         bh = free_list[isize];
 711         if (!bh)
 712                 goto refill;
 713         remove_from_free_list(bh);
 714
 715         /* OK, FINALLY we know that this buffer is the only one of its kind,
 716          * and that it's unused (b_count=0), unlocked, and clean.
 717          */
 718         init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
 719         bh->b_state=0;
 720         insert_into_queues(bh);
 721         return bh;
 722
 723         /*
 724          * If we block while refilling the free list, somebody may
 725          * create the buffer first ... search the hashes again.
 726          */
 727 refill:
 728         refill_freelist(size);
 729         if (!find_buffer(dev,block,size))
 730                 goto get_free;
 731         goto repeat;
 732 }
 733
 734 void set_writetime(struct buffer_head * buf, int flag)
 735 {
 736         int newtime;
 737
 738         if (buffer_dirty(buf)) {
 739                 /* Move buffer to dirty list if jiffies is clear. */
 740                 newtime = jiffies + (flag ? bdf_prm.b_un.age_super :
 741                                      bdf_prm.b_un.age_buffer);
 742                 if(!buf->b_flushtime || buf->b_flushtime > newtime)
 743                          buf->b_flushtime = newtime;
 744         } else {
 745                 buf->b_flushtime = 0;
 746         }
 747 }
 748
 749
 750 /*
 751  * Put a buffer into the appropriate list, without side-effects.
 752  */
 753 static inline void file_buffer(struct buffer_head *bh, int list)
 754 {
 755         remove_from_queues(bh);
 756         bh->b_list = list;
 757         insert_into_queues(bh);
 758 }
 759
 760 /*
 761  * A buffer may need to be moved from one buffer list to another
 762  * (e.g. in case it is not shared any more). Handle this.
 763  */
 764 void refile_buffer(struct buffer_head * buf)
 765 {
 766         int dispose;
 767
 768         if(buf->b_dev == B_FREE) {
 769                 printk("Attempt to refile free buffer\n");
 770                 return;
 771         }
 772         if (buffer_dirty(buf))
 773                 dispose = BUF_DIRTY;
 774         else if (buffer_locked(buf))
 775                 dispose = BUF_LOCKED;
 776         else
 777                 dispose = BUF_CLEAN;
 778         if(dispose != buf->b_list) {
 779                 file_buffer(buf, dispose);
 780                 if(dispose == BUF_DIRTY) {
 781                         int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
 782
 783                         /* This buffer is dirty, maybe we need to start flushing.
 784                          * If too high a percentage of the buffers are dirty...
 785                          */
 786                         if (nr_buffers_type[BUF_DIRTY] > too_many)
 787                                 wakeup_bdflush(1);
 788
 789                         /* If this is a loop device, and
 790                          * more than half of the buffers are dirty...
 791                          * (Prevents no-free-buffers deadlock with loop device.)
 792                          */
 793                         if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
 794                             nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
 795                                 wakeup_bdflush(1);
 796                 }
 797         }
 798 }
 799
 800 /*
 801  * Release a buffer head
 802  */
 803 void __brelse(struct buffer_head * buf)
 804 {
 805         /* If dirty, mark the time this buffer should be written back. */
 806         set_writetime(buf, 0);
 807         refile_buffer(buf);
 808         touch_buffer(buf);
 809
 810         if (buf->b_count) {
 811                 buf->b_count--;
 812                 return;
 813         }
 814         printk("VFS: brelse: Trying to free free buffer\n");
 815 }
 816
 817 /*
 818  * bforget() is like brelse(), except it puts the buffer on the
 819  * free list if it can.. We can NOT free the buffer if:
 820  *  - there are other users of it
 821  *  - it is locked and thus can have active IO
 822  */
 823 void __bforget(struct buffer_head * buf)
 824 {
 825         if (buf->b_count != 1 || buffer_locked(buf)) {
 826                 __brelse(buf);
 827                 return;
 828         }
 829         buf->b_count = 0;
 830         buf->b_state = 0;
 831         remove_from_queues(buf);
 832         put_last_free(buf);
 833 }
 834
 835 /*
 836  * bread() reads a specified block and returns the buffer that contains
 837  * it. It returns NULL if the block was unreadable.
 838  */
 839 struct buffer_head * bread(kdev_t dev, int block, int size)
 840 {
 841         struct buffer_head * bh;
 842
 843         bh = getblk(dev, block, size);
 844         if (buffer_uptodate(bh))
 845                 return bh;
 846         ll_rw_block(READ, 1, &bh);
 847         wait_on_buffer(bh);
 848         if (buffer_uptodate(bh))
 849                 return bh;
 850         brelse(bh);
 851         return NULL;
 852 }
 853
 854 /*
 855  * Ok, breada can be used as bread, but additionally to mark other
 856  * blocks for reading as well. End the argument list with a negative
 857  * number.
 858  */
 859
 860 #define NBUF 16
 861
 862 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
 863         unsigned int pos, unsigned int filesize)
 864 {
 865         struct buffer_head * bhlist[NBUF];
 866         unsigned int blocks;
 867         struct buffer_head * bh;
 868         int index;
 869         int i, j;
 870
 871         if (pos >= filesize)
 872                 return NULL;
 873
 874         if (block < 0)
 875                 return NULL;
 876
 877         bh = getblk(dev, block, bufsize);
 878         index = BUFSIZE_INDEX(bh->b_size);
 879
 880         if (buffer_uptodate(bh))
 881                 return(bh);
 882         else ll_rw_block(READ, 1, &bh);
 883
 884         blocks = (filesize - pos) >> (9+index);
 885
 886         if (blocks < (read_ahead[MAJOR(dev)] >> index))
 887                 blocks = read_ahead[MAJOR(dev)] >> index;
 888         if (blocks > NBUF)
 889                 blocks = NBUF;
 890
 891 /*      if (blocks) printk("breada (new) %d blocks\n",blocks); */
 892
 893
 894         bhlist[0] = bh;
 895         j = 1;
 896         for(i=1; i<blocks; i++) {
 897                 bh = getblk(dev,block+i,bufsize);
 898                 if (buffer_uptodate(bh)) {
 899                         brelse(bh);
 900                         break;
 901                 }
 902                 else bhlist[j++] = bh;
 903         }
 904
 905         /* Request the read for these buffers, and then release them. */
 906         if (j>1)
 907                 ll_rw_block(READA, (j-1), bhlist+1);
 908         for(i=1; i<j; i++)
 909                 brelse(bhlist[i]);
 910
 911         /* Wait for this buffer, and then continue on. */
 912         bh = bhlist[0];
 913         wait_on_buffer(bh);
 914         if (buffer_uptodate(bh))
 915                 return bh;
 916         brelse(bh);
 917         return NULL;
 918 }
 919
 920 /*
 921  * Note: the caller should wake up the buffer_wait list if needed.
 922  */
 923 static void put_unused_buffer_head(struct buffer_head * bh)
 924 {
 925         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
 926                 nr_buffer_heads--;
 927                 kmem_cache_free(bh_cachep, bh);
 928                 return;
 929         }
 930
 931         memset(bh,0,sizeof(*bh));
 932         nr_unused_buffer_heads++;
 933         bh->b_next_free = unused_list;
 934         unused_list = bh;
 935 }
 936
 937 /*
 938  * We can't put completed temporary IO buffer_heads directly onto the
 939  * unused_list when they become unlocked, since the device driver
 940  * end_request routines still expect access to the buffer_head's
 941  * fields after the final unlock.  So, the device driver puts them on
 942  * the reuse_list instead once IO completes, and we recover these to
 943  * the unused_list here.
 944  *
 945  * Note that we don't do a wakeup here, but return a flag indicating
 946  * whether we got any buffer heads. A task ready to sleep can check
 947  * the returned value, and any tasks already sleeping will have been
 948  * awakened when the buffer heads were added to the reuse list.
 949  */
 950 static inline int recover_reusable_buffer_heads(void)
 951 {
 952         struct buffer_head *head = xchg(&reuse_list, NULL);
 953         int found = 0;
 954
 955         if (head) {
 956                 do {
 957                         struct buffer_head *bh = head;
 958                         head = head->b_next_free;
 959                         put_unused_buffer_head(bh);
 960                 } while (head);
 961                 found = 1;
 962         }
 963         return found;
 964 }
 965
 966 /*
 967  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
 968  * no-buffer-head deadlock.  Return NULL on failure; waiting for
 969  * buffer heads is now handled in create_buffers().
 970  */
 971 static struct buffer_head * get_unused_buffer_head(int async)
 972 {
 973         struct buffer_head * bh;
 974
 975         recover_reusable_buffer_heads();
 976         if (nr_unused_buffer_heads > NR_RESERVED) {
 977                 bh = unused_list;
 978                 unused_list = bh->b_next_free;
 979                 nr_unused_buffer_heads--;
 980                 return bh;
 981         }
 982
 983         /* This is critical.  We can't swap out pages to get
 984          * more buffer heads, because the swap-out may need
 985          * more buffer-heads itself.  Thus SLAB_BUFFER.
 986          */
 987         if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
 988                 memset(bh, 0, sizeof(*bh));
 989                 nr_buffer_heads++;
 990                 return bh;
 991         }
 992
 993         /*
 994          * If we need an async buffer, use the reserved buffer heads.
 995          */
 996         if (async && unused_list) {
 997                 bh = unused_list;
 998                 unused_list = bh->b_next_free;
 999                 nr_unused_buffer_heads--;
1000                 return bh;
1001         }
1002
1003 #if 0
1004         /*
1005          * (Pending further analysis ...)
1006          * Ordinary (non-async) requests can use a different memory priority
1007          * to free up pages. Any swapping thus generated will use async
1008          * buffer heads.
1009          */
1010         if(!async &&
1011            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1012                 memset(bh, 0, sizeof(*bh));
1013                 nr_buffer_heads++;
1014                 return bh;
1015         }
1016 #endif
1017
1018         return NULL;
1019 }
1020
1021 /*
1022  * Create the appropriate buffers when given a page for data area and
1023  * the size of each buffer.. Use the bh->b_this_page linked list to
1024  * follow the buffers created.  Return NULL if unable to create more
1025  * buffers.
1026  * The async flag is used to differentiate async IO (paging, swapping)
1027  * from ordinary buffer allocations, and only async requests are allowed
1028  * to sleep waiting for buffer heads.
1029  */
1030 static struct buffer_head * create_buffers(unsigned long page,
1031                                                 unsigned long size, int async)
1032 {
1033         struct wait_queue wait = { current, NULL };
1034         struct buffer_head *bh, *head;
1035         long offset;
1036
1037 try_again:
1038         head = NULL;
1039         offset = PAGE_SIZE;
1040         while ((offset -= size) >= 0) {
1041                 bh = get_unused_buffer_head(async);
1042                 if (!bh)
1043                         goto no_grow;
1044
1045                 bh->b_dev = B_FREE;  /* Flag as unused */
1046                 bh->b_this_page = head;
1047                 head = bh;
1048
1049                 bh->b_state = 0;
1050                 bh->b_next_free = NULL;
1051                 bh->b_count = 0;
1052                 bh->b_size = size;
1053
1054                 bh->b_data = (char *) (page+offset);
1055                 bh->b_list = 0;
1056         }
1057         return head;
1058 /*
1059  * In case anything failed, we just free everything we got.
1060  */
1061 no_grow:
1062         if (head) {
1063                 do {
1064                         bh = head;
1065                         head = head->b_this_page;
1066                         put_unused_buffer_head(bh);
1067                 } while (head);
1068
1069                 /* Wake up any waiters ... */
1070                 wake_up(&buffer_wait);
1071         }
1072
1073         /*
1074          * Return failure for non-async IO requests.  Async IO requests
1075          * are not allowed to fail, so we have to wait until buffer heads
1076          * become available.  But we don't want tasks sleeping with
1077          * partially complete buffers, so all were released above.
1078          */
1079         if (!async)
1080                 return NULL;
1081
1082         /* We're _really_ low on memory. Now we just
1083          * wait for old buffer heads to become free due to
1084          * finishing IO.  Since this is an async request and
1085          * the reserve list is empty, we're sure there are
1086          * async buffer heads in use.
1087          */
1088         run_task_queue(&tq_disk);
1089
1090         /*
1091          * Set our state for sleeping, then check again for buffer heads.
1092          * This ensures we won't miss a wake_up from an interrupt.
1093          */
1094         add_wait_queue(&buffer_wait, &wait);
1095         current->state = TASK_UNINTERRUPTIBLE;
1096         if (!recover_reusable_buffer_heads())
1097                 schedule();
1098         remove_wait_queue(&buffer_wait, &wait);
1099         current->state = TASK_RUNNING;
1100         goto try_again;
1101 }
1102
1103 /* Run the hooks that have to be done when a page I/O has completed. */
1104 static inline void after_unlock_page (struct page * page)
1105 {
1106         if (test_and_clear_bit(PG_decr_after, &page->flags)) {
1107                 atomic_dec(&nr_async_pages);
1108 #ifdef DEBUG_SWAP
1109                 printk ("DebugVM: Finished IO on page %p, nr_async_pages %d\n",
1110                         (char *) page_address(page),
1111                         atomic_read(&nr_async_pages));
1112 #endif
1113         }
1114         if (test_and_clear_bit(PG_swap_unlock_after, &page->flags))
1115                 swap_after_unlock_page(page->offset);
1116         if (test_and_clear_bit(PG_free_after, &page->flags))
1117                 __free_page(page);
1118 }
1119
1120 /*
1121  * Free all temporary buffers belonging to a page.
1122  * This needs to be called with interrupts disabled.
1123  */
1124 static inline void free_async_buffers (struct buffer_head * bh)
1125 {
1126         struct buffer_head *tmp, *tail;
1127
1128         /*
1129          * Link all the buffers into the b_next_free list,
1130          * so we only have to do one xchg() operation ...
1131          */
1132         tail = bh;
1133         while ((tmp = tail->b_this_page) != bh) {
1134                 tail->b_next_free = tmp;
1135                 tail = tmp;
1136         };
1137
1138         /* Update the reuse list */
1139         tail->b_next_free = xchg(&reuse_list, NULL);
1140         reuse_list = bh;
1141
1142         /* Wake up any waiters ... */
1143         wake_up(&buffer_wait);
1144 }
1145
1146 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
1147 {
1148         unsigned long flags;
1149         struct buffer_head *tmp;
1150         struct page *page;
1151
1152         mark_buffer_uptodate(bh, uptodate);
1153         unlock_buffer(bh);
1154
1155         /* This is a temporary buffer used for page I/O. */
1156         page = mem_map + MAP_NR(bh->b_data);
1157         if (!PageLocked(page))
1158                 goto not_locked;
1159         if (bh->b_count != 1)
1160                 goto bad_count;
1161
1162         if (!test_bit(BH_Uptodate, &bh->b_state))
1163                 set_bit(PG_error, &page->flags);
1164
1165         /*
1166          * Be _very_ careful from here on. Bad things can happen if
1167          * two buffer heads end IO at almost the same time and both
1168          * decide that the page is now completely done.
1169          *
1170          * Async buffer_heads are here only as labels for IO, and get
1171          * thrown away once the IO for this page is complete.  IO is
1172          * deemed complete once all buffers have been visited
1173          * (b_count==0) and are now unlocked. We must make sure that
1174          * only the _last_ buffer that decrements its count is the one
1175          * that free's the page..
1176          */
1177         save_flags(flags);
1178         cli();
1179         bh->b_count--;
1180         tmp = bh;
1181         do {
1182                 if (tmp->b_count)
1183                         goto still_busy;
1184                 tmp = tmp->b_this_page;
1185         } while (tmp != bh);
1186
1187         /* OK, the async IO on this page is complete. */
1188         free_async_buffers(bh);
1189         restore_flags(flags);
1190         clear_bit(PG_locked, &page->flags);
1191         wake_up(&page->wait);
1192         after_unlock_page(page);
1193         return;
1194
1195 still_busy:
1196         restore_flags(flags);
1197         return;
1198
1199 not_locked:
1200         printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
1201         return;
1202
1203 bad_count:
1204         printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
1205         return;
1206 }
1207
1208 /*
1209  * Start I/O on a page.
1210  * This function expects the page to be locked and may return before I/O is complete.
1211  * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
1212  */
1213 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1214 {
1215         struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
1216         int block, nr;
1217
1218         if (!PageLocked(page))
1219                 panic("brw_page: page not locked for I/O");
1220         clear_bit(PG_uptodate, &page->flags);
1221         clear_bit(PG_error, &page->flags);
1222         /*
1223          * Allocate async buffer heads pointing to this page, just for I/O.
1224          * They do _not_ show up in the buffer hash table!
1225          * They are _not_ registered in page->buffers either!
1226          */
1227         bh = create_buffers(page_address(page), size, 1);
1228         if (!bh) {
1229                 /* WSH: exit here leaves page->count incremented */
1230                 clear_bit(PG_locked, &page->flags);
1231                 wake_up(&page->wait);
1232                 return -ENOMEM;
1233         }
1234         nr = 0;
1235         next = bh;
1236         do {
1237                 struct buffer_head * tmp;
1238                 block = *(b++);
1239
1240                 init_buffer(next, dev, block, end_buffer_io_async, NULL);
1241                 set_bit(BH_Uptodate, &next->b_state);
1242
1243                 /*
1244                  * When we use bmap, we define block zero to represent
1245                  * a hole.  ll_rw_page, however, may legitimately
1246                  * access block zero, and we need to distinguish the
1247                  * two cases.
1248                  */
1249                 if (bmap && !block) {
1250                         memset(next->b_data, 0, size);
1251                         next->b_count--;
1252                         continue;
1253                 }
1254                 tmp = get_hash_table(dev, block, size);
1255                 if (tmp) {
1256                         if (!buffer_uptodate(tmp)) {
1257                                 if (rw == READ)
1258                                         ll_rw_block(READ, 1, &tmp);
1259                                 wait_on_buffer(tmp);
1260                         }
1261                         if (rw == READ)
1262                                 memcpy(next->b_data, tmp->b_data, size);
1263                         else {
1264                                 memcpy(tmp->b_data, next->b_data, size);
1265                                 mark_buffer_dirty(tmp, 0);
1266                         }
1267                         brelse(tmp);
1268                         next->b_count--;
1269                         continue;
1270                 }
1271                 if (rw == READ)
1272                         clear_bit(BH_Uptodate, &next->b_state);
1273                 else
1274                         set_bit(BH_Dirty, &next->b_state);
1275                 arr[nr++] = next;
1276         } while (prev = next, (next = next->b_this_page) != NULL);
1277         prev->b_this_page = bh;
1278
1279         if (nr) {
1280                 ll_rw_block(rw, nr, arr);
1281                 /* The rest of the work is done in mark_buffer_uptodate()
1282                  * and unlock_buffer(). */
1283         } else {
1284                 unsigned long flags;
1285                 clear_bit(PG_locked, &page->flags);
1286                 set_bit(PG_uptodate, &page->flags);
1287                 wake_up(&page->wait);
1288                 save_flags(flags);
1289                 cli();
1290                 free_async_buffers(bh);
1291                 restore_flags(flags);
1292                 after_unlock_page(page);
1293         }
1294         ++current->maj_flt;
1295         return 0;
1296 }
1297
1298 /*
1299  * This is called by end_request() when I/O has completed.
1300  */
1301 void mark_buffer_uptodate(struct buffer_head * bh, int on)
1302 {
1303         if (on) {
1304                 struct buffer_head *tmp = bh;
1305                 set_bit(BH_Uptodate, &bh->b_state);
1306                 /* If a page has buffers and all these buffers are uptodate,
1307                  * then the page is uptodate. */
1308                 do {
1309                         if (!test_bit(BH_Uptodate, &tmp->b_state))
1310                                 return;
1311                         tmp=tmp->b_this_page;
1312                 } while (tmp && tmp != bh);
1313                 set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
1314                 return;
1315         }
1316         clear_bit(BH_Uptodate, &bh->b_state);
1317 }
1318
1319 /*
1320  * Generic "readpage" function for block devices that have the normal
1321  * bmap functionality. This is most of the block device filesystems.
1322  * Reads the page asynchronously --- the unlock_buffer() and
1323  * mark_buffer_uptodate() functions propagate buffer state into the
1324  * page struct once IO has completed.
1325  */
1326 int generic_readpage(struct file * file, struct page * page)
1327 {
1328         struct dentry *dentry = file->f_dentry;
1329         struct inode *inode = dentry->d_inode;
1330         unsigned long block;
1331         int *p, nr[PAGE_SIZE/512];
1332         int i;
1333
1334         atomic_inc(&page->count);
1335         set_bit(PG_locked, &page->flags);
1336         set_bit(PG_free_after, &page->flags);
1337
1338         i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1339         block = page->offset >> inode->i_sb->s_blocksize_bits;
1340         p = nr;
1341         do {
1342                 *p = inode->i_op->bmap(inode, block);
1343                 i--;
1344                 block++;
1345                 p++;
1346         } while (i > 0);
1347
1348         /* IO start */
1349         brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
1350         return 0;
1351 }
1352
1353 /*
1354  * Try to increase the number of buffers available: the size argument
1355  * is used to determine what kind of buffers we want.
1356  */
1357 static int grow_buffers(int size)
1358 {
1359         unsigned long page;
1360         struct buffer_head *bh, *tmp;
1361         struct buffer_head * insert_point;
1362         int isize;
1363
1364         if ((size & 511) || (size > PAGE_SIZE)) {
1365                 printk("VFS: grow_buffers: size = %d\n",size);
1366                 return 0;
1367         }
1368
1369         if (!(page = __get_free_page(GFP_BUFFER)))
1370                 return 0;
1371         bh = create_buffers(page, size, 0);
1372         if (!bh) {
1373                 free_page(page);
1374                 return 0;
1375         }
1376
1377         isize = BUFSIZE_INDEX(size);
1378         insert_point = free_list[isize];
1379
1380         tmp = bh;
1381         while (1) {
1382                 if (insert_point) {
1383                         tmp->b_next_free = insert_point->b_next_free;
1384                         tmp->b_prev_free = insert_point;
1385                         insert_point->b_next_free->b_prev_free = tmp;
1386                         insert_point->b_next_free = tmp;
1387                 } else {
1388                         tmp->b_prev_free = tmp;
1389                         tmp->b_next_free = tmp;
1390                 }
1391                 insert_point = tmp;
1392                 ++nr_buffers;
1393                 if (tmp->b_this_page)
1394                         tmp = tmp->b_this_page;
1395                 else
1396                         break;
1397         }
1398         tmp->b_this_page = bh;
1399         free_list[isize] = bh;
1400         mem_map[MAP_NR(page)].buffers = bh;
1401         buffermem += PAGE_SIZE;
1402         return 1;
1403 }
1404
1405 /*
1406  * Can the buffer be thrown out?
1407  */
1408 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1409 #define buffer_busy(bh)         ((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
1410
1411 /*
1412  * try_to_free_buffers() checks if all the buffers on this particular page
1413  * are unused, and free's the page if so.
1414  *
1415  * Wake up bdflush() if this fails - if we're running low on memory due
1416  * to dirty buffers, we need to flush them out as quickly as possible.
1417  */
1418 int try_to_free_buffers(struct page * page_map)
1419 {
1420         struct buffer_head * tmp, * bh = page_map->buffers;
1421
1422         tmp = bh;
1423         do {
1424                 struct buffer_head * p = tmp;
1425
1426                 tmp = tmp->b_this_page;
1427                 if (!buffer_busy(p))
1428                         continue;
1429
1430                 wakeup_bdflush(0);
1431                 return 0;
1432         } while (tmp != bh);
1433
1434         tmp = bh;
1435         do {
1436                 struct buffer_head * p = tmp;
1437                 tmp = tmp->b_this_page;
1438                 nr_buffers--;
1439                 remove_from_queues(p);
1440                 put_unused_buffer_head(p);
1441         } while (tmp != bh);
1442
1443         /* Wake up anyone waiting for buffer heads */
1444         wake_up(&buffer_wait);
1445
1446         /* And free the page */
1447         buffermem -= PAGE_SIZE;
1448         page_map->buffers = NULL;
1449         __free_page(page_map);
1450         return 1;
1451 }
1452
1453 /* ================== Debugging =================== */
1454
1455 void show_buffers(void)
1456 {
1457         struct buffer_head * bh;
1458         int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
1459         int protected = 0;
1460         int nlist;
1461         static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
1462
1463         printk("Buffer memory:   %6dkB\n",buffermem>>10);
1464         printk("Buffer heads:    %6d\n",nr_buffer_heads);
1465         printk("Buffer blocks:   %6d\n",nr_buffers);
1466         printk("Buffer hashed:   %6d\n",nr_hashed_buffers);
1467
1468         for(nlist = 0; nlist < NR_LIST; nlist++) {
1469           found = locked = dirty = used = lastused = protected = 0;
1470           bh = lru_list[nlist];
1471           if(!bh) continue;
1472
1473           do {
1474                 found++;
1475                 if (buffer_locked(bh))
1476                         locked++;
1477                 if (buffer_protected(bh))
1478                         protected++;
1479                 if (buffer_dirty(bh))
1480                         dirty++;
1481                 if (bh->b_count)
1482                         used++, lastused = found;
1483                 bh = bh->b_next_free;
1484           } while (bh != lru_list[nlist]);
1485           printk("%8s: %d buffers, %d used (last=%d), "
1486                  "%d locked, %d protected, %d dirty\n",
1487                  buf_types[nlist], found, used, lastused,
1488                  locked, protected, dirty);
1489         };
1490 }
1491
1492
1493 /* ===================== Init ======================= */
1494
1495 /*
1496  * allocate the hash table and init the free list
1497  * Use gfp() for the hash table to decrease TLB misses, use
1498  * SLAB cache for buffer heads.
1499  */
1500 void __init buffer_init(unsigned long memory_size)
1501 {
1502         int order;
1503         unsigned int nr_hash;
1504
1505         /* we need to guess at the right sort of size for a buffer cache.
1506            the heuristic from working with large databases and getting
1507            fsync times (ext2) manageable, is the following */
1508
1509         memory_size >>= 20;
1510         for (order = 5; (1UL << order) < memory_size; order++);
1511
1512         /* try to allocate something until we get it or we're asking
1513            for something that is really too small */
1514
1515         do {
1516                 nr_hash = (1UL << order) * PAGE_SIZE /
1517                     sizeof(struct buffer_head *);
1518                 hash_table = (struct buffer_head **)
1519                     __get_free_pages(GFP_ATOMIC, order);
1520         } while (hash_table == NULL && --order > 4);
1521
1522         if (!hash_table)
1523                 panic("Failed to allocate buffer hash table\n");
1524         memset(hash_table, 0, nr_hash * sizeof(struct buffer_head *));
1525         bh_hash_mask = nr_hash-1;
1526
1527         bh_cachep = kmem_cache_create("buffer_head",
1528                                       sizeof(struct buffer_head),
1529                                       0,
1530                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
1531         if(!bh_cachep)
1532                 panic("Cannot create buffer head SLAB cache\n");
1533         /*
1534          * Allocate the reserved buffer heads.
1535          */
1536         while (nr_buffer_heads < NR_RESERVED) {
1537                 struct buffer_head * bh;
1538
1539                 bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
1540                 if (!bh)
1541                         break;
1542                 put_unused_buffer_head(bh);
1543                 nr_buffer_heads++;
1544         }
1545
1546         lru_list[BUF_CLEAN] = 0;
1547         grow_buffers(BLOCK_SIZE);
1548 }
1549
1550
1551 /* ====================== bdflush support =================== */
1552
1553 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1554  * response to dirty buffers.  Once this process is activated, we write back
1555  * a limited number of buffers to the disks and then go back to sleep again.
1556  */
1557 static struct wait_queue * bdflush_done = NULL;
1558 struct task_struct *bdflush_tsk = 0;
1559
1560 void wakeup_bdflush(int wait)
1561 {
1562         if (current == bdflush_tsk)
1563                 return;
1564         wake_up_process(bdflush_tsk);
1565         if (wait) {
1566                 run_task_queue(&tq_disk);
1567                 sleep_on(&bdflush_done);
1568         }
1569 }
1570
1571
1572 /*
1573  * Here we attempt to write back old buffers.
1574  * To prevent deadlocks for a loop device:
1575  * 1) Do non-blocking writes to loop (avoids deadlock with running
1576  *      out of request blocks).
1577  * 2) But do a blocking write if the only dirty buffers are loop buffers
1578  *      (otherwise we go into an infinite busy-loop).
1579  * 3) Quit writing loop blocks if a freelist went low (avoids deadlock
1580  *      with running out of free buffers for loop's "real" device).
1581 */
1582
1583 static inline void sync_old_buffers(void)
1584 {
1585         int i;
1586         int ndirty = 0;
1587         int wrta_cmd = WRITEA;
1588 #ifdef DEBUG
1589         int ncount = 0, nwritten = 0;
1590 #endif
1591         struct buffer_head * bh, *next;
1592
1593 #ifdef DEBUG
1594         bh = lru_list[BUF_CLEAN];
1595         if(bh)
1596                 for(i = nr_buffers_type[BUF_CLEAN]; --i > 0; bh = next) {
1597                         next = bh->b_next_free;
1598
1599                         /* Dirty/locked buffer on clean list?  Refile it */
1600                         if (buffer_locked(bh) || buffer_dirty(bh)) {
1601                                 ncount++;
1602                                 refile_buffer(bh);
1603                         }
1604                 }
1605 #endif
1606
1607         bh = lru_list[BUF_LOCKED];
1608         if(bh)
1609                 for(i = nr_buffers_type[BUF_LOCKED]; --i > 0; bh = next) {
1610                         next = bh->b_next_free;
1611
1612                         /* Unlocked buffer on locked list?  Refile it */
1613                         if (!buffer_locked(bh))
1614                                 refile_buffer(bh);
1615                 }
1616
1617  restart:
1618         bh = lru_list[BUF_DIRTY];
1619         if(bh)
1620                 for (i = nr_buffers_type[BUF_DIRTY];
1621                      i-- > 0 && ndirty < bdf_prm.b_un.ndirty;
1622                      bh = next) {
1623                         /* We may have stalled while waiting for
1624                            I/O to complete. */
1625                         if(bh->b_list != BUF_DIRTY)
1626                                 goto restart;
1627                         next = bh->b_next_free;
1628                         if(!lru_list[BUF_DIRTY]) {
1629                                 printk("Dirty list empty %d\n", i);
1630                                 break;
1631                         }
1632
1633                         /* Clean buffer on dirty list?  Refile it */
1634                         if (!buffer_dirty(bh)) {
1635                                 refile_buffer(bh);
1636                                 continue;
1637                         }
1638
1639                         if (buffer_locked(bh))
1640                                 continue;
1641                         /* Should we write back buffers that are
1642                            shared or not??  Currently dirty buffers
1643                            are not shared, so it does not matter */
1644                         next->b_count++;
1645                         bh->b_count++;
1646                         ndirty++;
1647                         bh->b_flushtime = 0;
1648                         if (MAJOR(bh->b_dev) == LOOP_MAJOR) {
1649                                 ll_rw_block(wrta_cmd,1, &bh);
1650                                 wrta_cmd = WRITEA;
1651                                 if (buffer_dirty(bh))
1652                                         --ndirty;
1653                         }
1654                         else
1655                                 ll_rw_block(WRITE, 1, &bh);
1656                         bh->b_count--;
1657                         next->b_count--;
1658                 }
1659         /* If we didn't write anything, but there are still
1660          * dirty buffers, then make the next write to a
1661          * loop device to be a blocking write.
1662          * This lets us block--which we _must_ do! */
1663         if (ndirty == 0
1664             && nr_buffers_type[BUF_DIRTY] > 0 && wrta_cmd != WRITE) {
1665                 wrta_cmd = WRITE;
1666                 goto restart;
1667         }
1668
1669 #ifdef DEBUG
1670         if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
1671         printk("wrote %d/%d buffers...", nwritten, ndirty);
1672 #endif
1673         run_task_queue(&tq_disk);
1674 }
1675
1676
1677 /* This is the interface to bdflush.  As we get more sophisticated, we can
1678  * pass tuning parameters to this "process", to adjust how it behaves.
1679  * We would want to verify each parameter, however, to make sure that it
1680  * is reasonable. */
1681
1682 asmlinkage int sys_bdflush(int func, long data)
1683 {
1684         int i, error = -EPERM;
1685
1686         lock_kernel();
1687         if (!capable(CAP_SYS_ADMIN))
1688                 goto out;
1689
1690         if (func == 1)
1691                 /* Func 1 used to call sync_old_buffers; a user space
1692                    daemon would call it periodically.  This is no
1693                    longer necessary.  Returning -EPERM here makes the
1694                    daemon silently exit.  */
1695                 goto out;
1696
1697         /* Basically func 1 means read param 1, 2 means write param 1, etc */
1698         if (func >= 2) {
1699                 i = (func-2) >> 1;
1700                 error = -EINVAL;
1701                 if (i < 0 || i >= N_PARAM)
1702                         goto out;
1703                 if((func & 1) == 0) {
1704                         error = put_user(bdf_prm.data[i], (int*)data);
1705                         goto out;
1706                 }
1707                 if (data < bdflush_min[i] || data > bdflush_max[i])
1708                         goto out;
1709                 bdf_prm.data[i] = data;
1710                 error = 0;
1711                 goto out;
1712         };
1713
1714         /* Having func 0 used to launch the actual bdflush and then never
1715          * return (unless explicitly killed). We return zero here to
1716          * remain semi-compatible with present update(8) programs.
1717          */
1718         error = 0;
1719 out:
1720         unlock_kernel();
1721         return error;
1722 }
1723
1724 /* This is the actual bdflush daemon itself. It used to be started
1725  * from the syscall above, but now we launch it ourselves internally
1726  * with kernel_thread(...)  directly after the first thread in
1727  * init/main.c.  Every so often, or when woken up by another task that
1728  * needs memory, we call sync_old_buffers to partially clear the dirty list.
1729  */
1730
1731 int bdflush(void * unused)
1732 {
1733         long remaining = HZ * bdf_prm.b_un.interval;
1734         struct task_struct *tsk = current;
1735
1736         /*
1737          *      We have a bare-bones task_struct, and really should fill
1738          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
1739          *      display semi-sane things. Not real crucial though...
1740          */
1741
1742         tsk->session = 1;
1743         tsk->pgrp = 1;
1744         tsk->dumpable = 0;  /* inhibit ptrace() */
1745         strcpy(tsk->comm, "kflushd");
1746         sigfillset(&tsk->blocked);
1747         bdflush_tsk = tsk;
1748
1749         /*
1750          *      As a kernel thread we want to tamper with system buffers
1751          *      and other internals and thus be subject to the SMP locking
1752          *      rules. (On a uniprocessor box this does nothing).
1753          */
1754         lock_kernel();
1755
1756         for (;;) {
1757                 tsk->state = TASK_INTERRUPTIBLE;
1758                 remaining = schedule_timeout(remaining);
1759
1760 #ifdef DEBUG
1761                 printk("bdflush() activated...");
1762 #endif
1763                 CHECK_EMERGENCY_SYNC
1764
1765                 if (remaining == 0) {
1766                         /*
1767                          * Also try to flush inodes and supers, since
1768                          * otherwise there would be no way of ensuring
1769                          * that these quantities ever get written
1770                          * back.  Ideally, we would have a timestamp
1771                          * on the inodes and superblocks so that we
1772                          * could write back only the old ones.
1773                          */
1774                         sync_supers(0);
1775                         sync_inodes(0);
1776                         remaining = HZ * bdf_prm.b_un.interval;
1777                 }
1778
1779                 /* Keep flushing till there aren't very many dirty buffers */
1780                 do {
1781                         sync_old_buffers();
1782                 } while(nr_buffers_type[BUF_DIRTY] > nr_buffers * bdf_prm.b_un.nfract/100);
1783
1784                 wake_up(&bdflush_done);
1785 #ifdef DEBUG
1786                 printk("sleeping again.\n");
1787 #endif
1788         }
1789 }