Import 2.3.12pre9
[davej-history.git] / fs / buffer.c
1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 #include <linux/sched.h>
30 #include <linux/fs.h>
31 #include <linux/malloc.h>
32 #include <linux/locks.h>
33 #include <linux/errno.h>
34 #include <linux/swap.h>
35 #include <linux/swapctl.h>
36 #include <linux/smp_lock.h>
37 #include <linux/vmalloc.h>
38 #include <linux/blkdev.h>
39 #include <linux/sysrq.h>
40 #include <linux/file.h>
41 #include <linux/init.h>
42 #include <linux/quotaops.h>
43 #include <linux/iobuf.h>
45 #include <asm/uaccess.h>
46 #include <asm/io.h>
47 #include <asm/bitops.h>
48 #include <asm/mmu_context.h>
50 #define NR_SIZES 7
51 static char buffersize_index[65] =
52 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
53 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
54 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
55 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
56 6};
58 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
59 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
60 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
61 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
62 number of unused buffer heads */
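As an aside, here is a minimal standalone sketch (plain userspace C, not part of the kernel file) of what buffersize_index[] and BUFSIZE_INDEX() above compute: the table is a precomputed log2(size) - 9 for the power-of-two block sizes 512..32768, so every valid size maps to a small index 0..6.

#include <assert.h>
#include <stdio.h>

static const signed char buffersize_index[65] =
{-1,  0,  1, -1,  2, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};
#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])

int main(void)
{
        int size;

        /* Every power-of-two block size from 512 to 32k maps to 0..6. */
        for (size = 512; size <= 32768; size <<= 1)
                printf("size %5d -> index %d\n", size, BUFSIZE_INDEX(size));
        assert(BUFSIZE_INDEX(1024) == 1);       /* 1024>>9 == 2, table[2] == 1 */
        return 0;
}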
64 /* Anti-deadlock ordering:
65 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
69 * Hash table gook..
71 static unsigned int bh_hash_mask = 0;
72 static unsigned int bh_hash_shift = 0;
73 static struct buffer_head **hash_table;
74 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
76 static struct buffer_head *lru_list[NR_LIST];
77 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
78 static int nr_buffers_type[NR_LIST] = {0,};
80 static struct buffer_head * unused_list = NULL;
81 static int nr_unused_buffer_heads = 0;
82 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
83 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
85 struct bh_free_head {
86 struct buffer_head *list;
87 spinlock_t lock;
89 static struct bh_free_head free_list[NR_SIZES];
91 static kmem_cache_t *bh_cachep;
93 static int grow_buffers(int size);
95 /* This is used by some architectures to estimate available memory. */
96 atomic_t buffermem = ATOMIC_INIT(0);
98 /* Here is the parameter block for the bdflush process. If you add or
99 * remove any of the parameters, make sure to update kernel/sysctl.c.
102 #define N_PARAM 9
104 /* The dummy values in this structure are left in there for compatibility
105 * with old programs that play with the /proc entries.
107 union bdflush_param {
108 struct {
109 int nfract; /* Percentage of buffer cache dirty to
110 activate bdflush */
111 int ndirty; /* Maximum number of dirty blocks to write out per
112 wake-cycle */
113 int nrefill; /* Number of clean buffers to try to obtain
114 each time we call refill */
115 int nref_dirt; /* Dirty buffer threshold for activating bdflush
116 when trying to refill buffers. */
117 int dummy1; /* unused */
118 int age_buffer; /* Time for normal buffer to age before we flush it */
119 int age_super; /* Time for superblock to age before we flush it */
120 int dummy2; /* unused */
121 int dummy3; /* unused */
122 } b_un;
123 unsigned int data[N_PARAM];
124 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
126 /* These are the min and max parameter values that we will allow to be assigned */
127 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
128 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
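A hedged, standalone sketch (userspace C; the parameter defaults are copied from bdf_prm above, while HZ and the helper name are assumptions for the example) of how a write to one of these tunables is validated against bdflush_min[]/bdflush_max[], mirroring the range check sys_bdflush() performs further down.

#include <stdio.h>

#define N_PARAM 9
#define HZ 100                          /* assumed tick rate for the example */

static unsigned int bdf_data[N_PARAM] = { 40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2 };
static const int bdflush_min[N_PARAM] = {   0,    10,     5,    25,    0,    1*HZ,    1*HZ,    1, 1 };
static const int bdflush_max[N_PARAM] = { 100, 50000, 20000, 20000, 1000, 6000*HZ, 6000*HZ, 2047, 5 };

/* Returns 0 on success, -1 (-EINVAL in the kernel) on a bad index or value. */
static int set_bdflush_param(int i, long value)
{
        if (i < 0 || i >= N_PARAM)
                return -1;
        if (value < bdflush_min[i] || value > bdflush_max[i])
                return -1;
        bdf_data[i] = value;
        return 0;
}

int main(void)
{
        printf("ndirty=500:   %d\n", set_bdflush_param(1, 500));    /* accepted */
        printf("ndirty=90000: %d\n", set_bdflush_param(1, 90000));  /* rejected */
        printf("ndirty is now %u\n", bdf_data[1]);
        return 0;
}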
130 void wakeup_bdflush(int);
133 * Rewrote the wait-routines to use the "new" wait-queue functionality,
134 * and getting rid of the cli-sti pairs. The wait-queue routines still
135 * need cli-sti, but now it's just a couple of 386 instructions or so.
137 * Note that the real wait_on_buffer() is an inline function that checks
138 * if 'b_wait' is set before calling this, so that the queues aren't set
139 * up unnecessarily.
141 void __wait_on_buffer(struct buffer_head * bh)
143 struct task_struct *tsk = current;
144 DECLARE_WAITQUEUE(wait, tsk);
146 atomic_inc(&bh->b_count);
147 add_wait_queue(&bh->b_wait, &wait);
148 repeat:
149 tsk->state = TASK_UNINTERRUPTIBLE;
150 run_task_queue(&tq_disk);
151 if (buffer_locked(bh)) {
152 schedule();
153 goto repeat;
155 tsk->state = TASK_RUNNING;
156 remove_wait_queue(&bh->b_wait, &wait);
157 atomic_dec(&bh->b_count);
160 /* Call sync_buffers with wait!=0 to ensure that the call does not
161 * return until all buffer writes have completed. Sync() may return
162 * before the writes have finished; fsync() may not.
165 /* Godamity-damn. Some buffers (bitmaps for filesystems)
166 * spontaneously dirty themselves without ever brelse being called.
167 * We will ultimately want to put these in a separate list, but for
168 * now we search all of the lists for dirty buffers.
170 static int sync_buffers(kdev_t dev, int wait)
172 int i, retry, pass = 0, err = 0;
173 struct buffer_head * bh, *next;
175 /* One pass for no-wait, three for wait:
176 * 0) write out all dirty, unlocked buffers;
177 * 1) write out all dirty buffers, waiting if locked;
178 * 2) wait for completion by waiting for all buffers to unlock.
180 do {
181 retry = 0;
183 /* We search all lists as a failsafe mechanism, not because we expect
184 * there to be dirty buffers on any of the other lists.
186 repeat:
187 spin_lock(&lru_list_lock);
188 bh = lru_list[BUF_DIRTY];
189 if (!bh)
190 goto repeat2;
192 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
193 next = bh->b_next_free;
195 if (!lru_list[BUF_DIRTY])
196 break;
197 if (dev && bh->b_dev != dev)
198 continue;
199 if (buffer_locked(bh)) {
200 /* Buffer is locked; skip it unless wait is
201 * requested AND pass > 0.
203 if (!wait || !pass) {
204 retry = 1;
205 continue;
207 atomic_inc(&bh->b_count);
208 spin_unlock(&lru_list_lock);
209 wait_on_buffer (bh);
210 atomic_dec(&bh->b_count);
211 goto repeat;
214 /* If an unlocked buffer is not uptodate, there has
215 * been an IO error. Skip it.
217 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
218 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
219 err = -EIO;
220 continue;
223 /* Don't write clean buffers. Don't write ANY buffers
224 * on the third pass.
226 if (!buffer_dirty(bh) || pass >= 2)
227 continue;
229 atomic_inc(&bh->b_count);
230 bh->b_flushtime = 0;
231 spin_unlock(&lru_list_lock);
232 ll_rw_block(WRITE, 1, &bh);
233 atomic_dec(&bh->b_count);
234 retry = 1;
235 goto repeat;
238 repeat2:
239 bh = lru_list[BUF_LOCKED];
240 if (!bh) {
241 spin_unlock(&lru_list_lock);
242 break;
244 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
245 next = bh->b_next_free;
247 if (!lru_list[BUF_LOCKED])
248 break;
249 if (dev && bh->b_dev != dev)
250 continue;
251 if (buffer_locked(bh)) {
252 /* Buffer is locked; skip it unless wait is
253 * requested AND pass > 0.
255 if (!wait || !pass) {
256 retry = 1;
257 continue;
259 atomic_inc(&bh->b_count);
260 spin_unlock(&lru_list_lock);
261 wait_on_buffer (bh);
262 spin_lock(&lru_list_lock);
263 atomic_dec(&bh->b_count);
264 goto repeat2;
267 spin_unlock(&lru_list_lock);
269 /* If we are waiting for the sync to succeed, and if any dirty
270 * blocks were written, then repeat; on the second pass, only
271 * wait for buffers being written (do not pass to write any
272 * more buffers on the second pass).
274 } while (wait && retry && ++pass<=2);
275 return err;
278 void sync_dev(kdev_t dev)
280 sync_buffers(dev, 0);
281 sync_supers(dev);
282 sync_inodes(dev);
283 sync_buffers(dev, 0);
284 DQUOT_SYNC(dev);
286 * FIXME(eric) we need to sync the physical devices here.
287 * This is because some (scsi) controllers have huge amounts of
288 * cache onboard (hundreds of Mb), and we need to instruct
289 * them to commit all of the dirty memory to disk, and we should
290 * not return until this has happened.
292 * This would need to get implemented by going through the assorted
293 * layers so that each block major number can be synced, and this
294 * would call down into the upper and mid-layer scsi.
298 int fsync_dev(kdev_t dev)
300 sync_buffers(dev, 0);
302 lock_kernel();
303 sync_supers(dev);
304 sync_inodes(dev);
305 DQUOT_SYNC(dev);
306 unlock_kernel();
308 return sync_buffers(dev, 1);
311 asmlinkage int sys_sync(void)
313 fsync_dev(0);
314 return 0;
318 * filp may be NULL if called via the msync of a vma.
321 int file_fsync(struct file *filp, struct dentry *dentry)
323 struct inode * inode = dentry->d_inode;
324 struct super_block * sb;
325 kdev_t dev;
327 /* sync the inode to buffers */
328 write_inode_now(inode);
330 /* sync the superblock to buffers */
331 sb = inode->i_sb;
332 wait_on_super(sb);
333 if (sb->s_op && sb->s_op->write_super)
334 sb->s_op->write_super(sb);
336 /* .. finally sync the buffers to disk */
337 dev = inode->i_dev;
338 return sync_buffers(dev, 1);
341 asmlinkage int sys_fsync(unsigned int fd)
343 struct file * file;
344 struct dentry * dentry;
345 struct inode * inode;
346 int err;
348 lock_kernel();
349 err = -EBADF;
350 file = fget(fd);
351 if (!file)
352 goto out;
354 dentry = file->f_dentry;
355 if (!dentry)
356 goto out_putf;
358 inode = dentry->d_inode;
359 if (!inode)
360 goto out_putf;
362 err = -EINVAL;
363 if (!file->f_op || !file->f_op->fsync)
364 goto out_putf;
366 /* We need to protect against concurrent writers.. */
367 down(&inode->i_sem);
368 err = file->f_op->fsync(file, dentry);
369 up(&inode->i_sem);
371 out_putf:
372 fput(file);
373 out:
374 unlock_kernel();
375 return err;
378 asmlinkage int sys_fdatasync(unsigned int fd)
380 struct file * file;
381 struct dentry * dentry;
382 struct inode * inode;
383 int err;
385 lock_kernel();
386 err = -EBADF;
387 file = fget(fd);
388 if (!file)
389 goto out;
391 dentry = file->f_dentry;
392 if (!dentry)
393 goto out_putf;
395 inode = dentry->d_inode;
396 if (!inode)
397 goto out_putf;
399 err = -EINVAL;
400 if (!file->f_op || !file->f_op->fsync)
401 goto out_putf;
403 /* this needs further work, at the moment it is identical to fsync() */
404 down(&inode->i_sem);
405 err = file->f_op->fsync(file, dentry);
406 up(&inode->i_sem);
408 out_putf:
409 fput(file);
410 out:
411 unlock_kernel();
412 return err;
415 void invalidate_buffers(kdev_t dev)
417 int nlist;
419 spin_lock(&lru_list_lock);
420 for(nlist = 0; nlist < NR_LIST; nlist++) {
421 struct buffer_head * bh;
422 int i;
423 retry:
424 bh = lru_list[nlist];
425 if (!bh)
426 continue;
427 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
428 if (bh->b_dev != dev)
429 continue;
430 if (buffer_locked(bh)) {
431 atomic_inc(&bh->b_count);
432 spin_unlock(&lru_list_lock);
433 wait_on_buffer(bh);
434 spin_lock(&lru_list_lock);
435 atomic_dec(&bh->b_count);
436 goto retry;
438 if (atomic_read(&bh->b_count))
439 continue;
440 bh->b_flushtime = 0;
441 clear_bit(BH_Protected, &bh->b_state);
442 clear_bit(BH_Uptodate, &bh->b_state);
443 clear_bit(BH_Dirty, &bh->b_state);
444 clear_bit(BH_Req, &bh->b_state);
447 spin_unlock(&lru_list_lock);
450 /* After several hours of tedious analysis, the following hash
451 * function won. Do not mess with it... -DaveM
453 #define _hashfn(dev,block) \
454 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
455 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
456 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
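To make the hash macro above easier to read, here is a hedged userspace transcription of _hashfn()/hash() (the 4096-bucket table size is an assumption for the example; the real size is chosen in buffer_init()). The point is that dev and block are folded together with a few shifts and XORs, and the final AND with bh_hash_mask keeps the index inside the power-of-two table.

#include <stdio.h>

static unsigned int bh_hash_shift = 12;                 /* assumed: 4096 buckets */
static unsigned int bh_hash_mask  = (1u << 12) - 1;

static unsigned int bh_hash(unsigned int dev, unsigned int block)
{
        unsigned int h = ((dev << (bh_hash_shift - 6)) ^ (dev << (bh_hash_shift - 9))) ^
                         ((block << (bh_hash_shift - 6)) ^ (block >> 13) ^
                          (block << (bh_hash_shift - 12)));
        return h & bh_hash_mask;
}

int main(void)
{
        unsigned int block;

        /* Neighbouring blocks of the same device land in different buckets. */
        for (block = 0; block < 4; block++)
                printf("dev 0x0801, block %u -> bucket %u\n",
                       block, bh_hash(0x0801, block));
        return 0;
}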
458 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
460 if ((bh->b_next = *head) != NULL)
461 bh->b_next->b_pprev = &bh->b_next;
462 *head = bh;
463 bh->b_pprev = head;
466 static __inline__ void __hash_unlink(struct buffer_head *bh)
468 if (bh->b_next)
469 bh->b_next->b_pprev = bh->b_pprev;
470 *(bh->b_pprev) = bh->b_next;
471 bh->b_pprev = NULL;
474 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
476 struct buffer_head **bhp = &lru_list[blist];
478 if(!*bhp) {
479 *bhp = bh;
480 bh->b_prev_free = bh;
482 bh->b_next_free = *bhp;
483 bh->b_prev_free = (*bhp)->b_prev_free;
484 (*bhp)->b_prev_free->b_next_free = bh;
485 (*bhp)->b_prev_free = bh;
486 nr_buffers_type[blist]++;
489 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
491 if (bh->b_prev_free || bh->b_next_free) {
492 bh->b_prev_free->b_next_free = bh->b_next_free;
493 bh->b_next_free->b_prev_free = bh->b_prev_free;
494 if (lru_list[blist] == bh)
495 lru_list[blist] = bh->b_next_free;
496 if (lru_list[blist] == bh)
497 lru_list[blist] = NULL;
498 bh->b_next_free = bh->b_prev_free = NULL;
499 nr_buffers_type[blist]--;
503 static void __remove_from_free_list(struct buffer_head * bh, int index)
505 if(bh->b_next_free == bh)
506 free_list[index].list = NULL;
507 else {
508 bh->b_prev_free->b_next_free = bh->b_next_free;
509 bh->b_next_free->b_prev_free = bh->b_prev_free;
510 if (free_list[index].list == bh)
511 free_list[index].list = bh->b_next_free;
513 bh->b_next_free = bh->b_prev_free = NULL;
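The free and LRU lists manipulated above are circular doubly-linked lists threaded through b_next_free/b_prev_free, with the head pointer naming an arbitrary element. A minimal standalone sketch of the same pointer discipline (simplified types, userspace, not kernel code):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct node {
        int blocknr;
        struct node *next_free, *prev_free;
};

static void list_insert(struct node **head, struct node *n)
{
        if (!*head) {
                *head = n;
                n->prev_free = n;
                n->next_free = n;
                return;
        }
        n->next_free = *head;
        n->prev_free = (*head)->prev_free;
        (*head)->prev_free->next_free = n;
        (*head)->prev_free = n;
}

static void list_remove(struct node **head, struct node *n)
{
        n->prev_free->next_free = n->next_free;
        n->next_free->prev_free = n->prev_free;
        if (*head == n)
                *head = n->next_free;
        if (*head == n)                 /* n was the only element */
                *head = NULL;
        n->next_free = n->prev_free = NULL;
}

int main(void)
{
        struct node a = { 1 }, b = { 2 }, *head = NULL;

        list_insert(&head, &a);
        list_insert(&head, &b);
        assert(head == &a && a.next_free == &b && b.next_free == &a);
        list_remove(&head, &a);
        assert(head == &b && b.next_free == &b);
        list_remove(&head, &b);
        assert(head == NULL);
        printf("circular list ok\n");
        return 0;
}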
516 /* The following two functions must operate atomically
517 * because they control the visibility of a buffer head
518 * to the rest of the kernel.
520 static __inline__ void __remove_from_queues(struct buffer_head *bh)
522 write_lock(&hash_table_lock);
523 if (bh->b_pprev)
524 __hash_unlink(bh);
525 __remove_from_lru_list(bh, bh->b_list);
526 write_unlock(&hash_table_lock);
529 static void insert_into_queues(struct buffer_head *bh)
531 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
533 spin_lock(&lru_list_lock);
534 write_lock(&hash_table_lock);
535 __hash_link(bh, head);
536 __insert_into_lru_list(bh, bh->b_list);
537 write_unlock(&hash_table_lock);
538 spin_unlock(&lru_list_lock);
541 /* This function must only run if there are no other
542 * references _anywhere_ to this buffer head.
544 static void put_last_free(struct buffer_head * bh)
546 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
547 struct buffer_head **bhp = &head->list;
549 spin_lock(&head->lock);
550 bh->b_dev = B_FREE;
551 if(!*bhp) {
552 *bhp = bh;
553 bh->b_prev_free = bh;
555 bh->b_next_free = *bhp;
556 bh->b_prev_free = (*bhp)->b_prev_free;
557 (*bhp)->b_prev_free->b_next_free = bh;
558 (*bhp)->b_prev_free = bh;
559 spin_unlock(&head->lock);
563 * Why like this, I hear you say... The reason is race-conditions.
564 * As we don't lock buffers (unless we are reading them, that is),
565 * something might happen to it while we sleep (ie a read-error
566 * will force it bad). This shouldn't really happen currently, but
567 * the code is ready.
569 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
571 struct buffer_head **head = &hash(dev, block);
572 struct buffer_head *bh;
574 read_lock(&hash_table_lock);
575 for(bh = *head; bh; bh = bh->b_next)
576 if (bh->b_blocknr == block &&
577 bh->b_size == size &&
578 bh->b_dev == dev)
579 break;
580 if (bh)
581 atomic_inc(&bh->b_count);
582 read_unlock(&hash_table_lock);
584 return bh;
587 unsigned int get_hardblocksize(kdev_t dev)
590 * Get the hard sector size for the given device. If we don't know
591 * what it is, return 0.
593 if (hardsect_size[MAJOR(dev)] != NULL) {
594 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
595 if (blksize != 0)
596 return blksize;
600 * We don't know what the hardware sector size for this device is.
601 * Return 0 indicating that we don't know.
603 return 0;
606 void set_blocksize(kdev_t dev, int size)
608 extern int *blksize_size[];
609 int i, nlist;
610 struct buffer_head * bh, *bhnext;
612 if (!blksize_size[MAJOR(dev)])
613 return;
615 /* Size must be a power of two, and between 512 and PAGE_SIZE */
616 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
617 panic("Invalid blocksize passed to set_blocksize");
619 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
620 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
621 return;
623 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
624 return;
625 sync_buffers(dev, 2);
626 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
628 /* We need to be quite careful how we do this - we are moving entries
629 * around on the free list, and we can get in a loop if we are not careful.
631 for(nlist = 0; nlist < NR_LIST; nlist++) {
632 repeat:
633 spin_lock(&lru_list_lock);
634 bh = lru_list[nlist];
635 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
636 if(!bh)
637 break;
639 bhnext = bh->b_next_free;
640 if (bh->b_dev != dev)
641 continue;
642 if (bh->b_size == size)
643 continue;
644 if (buffer_locked(bh)) {
645 atomic_inc(&bh->b_count);
646 spin_unlock(&lru_list_lock);
647 wait_on_buffer(bh);
648 atomic_dec(&bh->b_count);
649 goto repeat;
651 if (bh->b_dev == dev && bh->b_size != size) {
652 clear_bit(BH_Dirty, &bh->b_state);
653 clear_bit(BH_Uptodate, &bh->b_state);
654 clear_bit(BH_Req, &bh->b_state);
655 bh->b_flushtime = 0;
657 if (atomic_read(&bh->b_count) == 0) {
658 __remove_from_queues(bh);
659 put_last_free(bh);
662 spin_unlock(&lru_list_lock);
667 * We used to try various strange things. Let's not.
669 static void refill_freelist(int size)
671 if (!grow_buffers(size)) {
672 wakeup_bdflush(1);
673 current->policy |= SCHED_YIELD;
674 schedule();
678 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
680 bh->b_list = BUF_CLEAN;
681 bh->b_flushtime = 0;
682 bh->b_end_io = handler;
683 bh->b_dev_id = dev_id;
686 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
688 mark_buffer_uptodate(bh, uptodate);
689 unlock_buffer(bh);
692 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
694 mark_buffer_uptodate(bh, uptodate);
695 unlock_buffer(bh);
696 BUG();
699 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
701 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
702 unsigned long flags;
703 struct buffer_head *tmp;
704 struct page *page;
705 int free;
707 mark_buffer_uptodate(bh, uptodate);
709 /* This is a temporary buffer used for page I/O. */
710 page = mem_map + MAP_NR(bh->b_data);
712 if (!uptodate)
713 SetPageError(page);
716 * Be _very_ careful from here on. Bad things can happen if
717 * two buffer heads end IO at almost the same time and both
718 * decide that the page is now completely done.
720 * Async buffer_heads are here only as labels for IO, and get
721 * thrown away once the IO for this page is complete. IO is
722 * deemed complete once all buffers have been visited
723 * (b_count==0) and are now unlocked. We must make sure that
724 * only the _last_ buffer that decrements its count is the one
725 * that frees the page..
727 spin_lock_irqsave(&page_uptodate_lock, flags);
728 unlock_buffer(bh);
729 atomic_dec(&bh->b_count);
730 tmp = bh->b_this_page;
731 while (tmp != bh) {
732 if (atomic_read(&tmp->b_count) &&
733 (tmp->b_end_io == end_buffer_io_async))
734 goto still_busy;
735 tmp = tmp->b_this_page;
738 /* OK, the async IO on this page is complete. */
739 spin_unlock_irqrestore(&page_uptodate_lock, flags);
742 * if none of the buffers had errors then we can set the
743 * page uptodate:
745 if (!PageError(page))
746 SetPageUptodate(page);
749 * Run the hooks that have to be done when a page I/O has completed.
751 * Note - we need to test the flags before we unlock the page, but
752 * we must not actually free the page until after the unlock!
754 if (test_and_clear_bit(PG_decr_after, &page->flags))
755 atomic_dec(&nr_async_pages);
757 if (test_and_clear_bit(PG_free_swap_after, &page->flags))
758 swap_free(page->offset);
760 free = test_and_clear_bit(PG_free_after, &page->flags);
762 if (page->owner != (void *)-1)
763 PAGE_BUG(page);
764 page->owner = current;
765 UnlockPage(page);
767 if (free)
768 __free_page(page);
770 return;
772 still_busy:
773 spin_unlock_irqrestore(&page_uptodate_lock, flags);
774 return;
779 * Ok, this is getblk, and it isn't very clear, again to hinder
780 * race-conditions. Most of the code is seldom used, (ie repeating),
781 * so it should be much more efficient than it looks.
783 * The algorithm is changed: hopefully better, and an elusive bug removed.
785 * 14.02.92: changed it to sync dirty buffers a bit: better performance
786 * when the filesystem starts to get full of dirty blocks (I hope).
788 struct buffer_head * getblk(kdev_t dev, int block, int size)
790 struct buffer_head * bh;
791 int isize;
793 repeat:
794 bh = get_hash_table(dev, block, size);
795 if (bh) {
796 if (!buffer_dirty(bh)) {
797 bh->b_flushtime = 0;
799 goto out;
802 isize = BUFSIZE_INDEX(size);
803 spin_lock(&free_list[isize].lock);
804 bh = free_list[isize].list;
805 if (bh) {
806 __remove_from_free_list(bh, isize);
807 atomic_set(&bh->b_count, 1);
809 spin_unlock(&free_list[isize].lock);
810 if (!bh)
811 goto refill;
813 /* OK, FINALLY we know that this buffer is the only one of its kind,
814 * we hold a reference (b_count>0), it is unlocked, and it is clean.
816 init_buffer(bh, end_buffer_io_sync, NULL);
817 bh->b_dev = dev;
818 bh->b_blocknr = block;
819 bh->b_state = 1 << BH_Mapped;
821 /* Insert the buffer into the regular lists */
822 insert_into_queues(bh);
823 goto out;
826 * If we block while refilling the free list, somebody may
827 * create the buffer first ... search the hashes again.
829 refill:
830 refill_freelist(size);
831 goto repeat;
832 out:
833 return bh;
837 * if a new dirty buffer is created we need to balance bdflush.
839 * in the future we might want to make bdflush aware of different
840 * pressures on different devices - thus the (currently unused)
841 * 'dev' parameter.
843 int too_many_dirty_buffers;
845 void balance_dirty(kdev_t dev)
847 int dirty = nr_buffers_type[BUF_DIRTY];
848 int ndirty = bdf_prm.b_un.ndirty;
850 if (dirty > ndirty) {
851 if (dirty > 2*ndirty) {
852 too_many_dirty_buffers = 1;
853 wakeup_bdflush(1);
854 return;
856 wakeup_bdflush(0);
858 too_many_dirty_buffers = 0;
859 return;
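A small sketch (userspace; the sample counts are illustrative) of the two thresholds balance_dirty() applies: above 2*ndirty it declares the "too many dirty buffers" panic mode and waits for bdflush; otherwise the panic flag is cleared, and if the count still exceeds ndirty, bdflush is nudged asynchronously.

#include <stdio.h>

/* Returns a description of what balance_dirty() would do for this count. */
static const char *dirty_policy(int dirty, int ndirty)
{
        if (dirty > 2 * ndirty)
                return "panic mode: wake bdflush and wait for it";
        if (dirty > ndirty)
                return "wake bdflush asynchronously, clear panic flag";
        return "clear panic flag, nothing to flush";
}

int main(void)
{
        int ndirty = 500;                       /* bdf_prm.b_un.ndirty default */
        int samples[] = { 100, 600, 1200 };
        int i;

        for (i = 0; i < 3; i++)
                printf("%4d dirty buffers: %s\n",
                       samples[i], dirty_policy(samples[i], ndirty));
        return 0;
}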
862 static inline void __mark_dirty(struct buffer_head *bh, int flag)
864 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
865 clear_bit(BH_New, &bh->b_state);
866 refile_buffer(bh);
869 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
871 __mark_dirty(bh, flag);
875 * A buffer may need to be moved from one buffer list to another
876 * (e.g. in case it is not shared any more). Handle this.
878 static __inline__ void __refile_buffer(struct buffer_head *bh)
880 int dispose = BUF_CLEAN;
881 if (buffer_locked(bh))
882 dispose = BUF_LOCKED;
883 if (buffer_dirty(bh))
884 dispose = BUF_DIRTY;
885 if (dispose != bh->b_list) {
886 __remove_from_lru_list(bh, bh->b_list);
887 bh->b_list = dispose;
888 __insert_into_lru_list(bh, dispose);
892 void refile_buffer(struct buffer_head *bh)
894 spin_lock(&lru_list_lock);
895 __refile_buffer(bh);
896 spin_unlock(&lru_list_lock);
900 * Release a buffer head
902 void __brelse(struct buffer_head * buf)
904 touch_buffer(buf);
906 if (atomic_read(&buf->b_count)) {
907 atomic_dec(&buf->b_count);
908 return;
910 printk("VFS: brelse: Trying to free free buffer\n");
914 * bforget() is like brelse(), except it puts the buffer on the
915 * free list if it can.. We can NOT free the buffer if:
916 * - there are other users of it
917 * - it is locked and thus can have active IO
919 void __bforget(struct buffer_head * buf)
921 spin_lock(&lru_list_lock);
922 write_lock(&hash_table_lock);
923 if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
924 touch_buffer(buf);
925 atomic_dec(&buf->b_count);
926 } else {
927 atomic_set(&buf->b_count, 0);
928 buf->b_state = 0;
929 if (buf->b_pprev)
930 __hash_unlink(buf);
931 __remove_from_lru_list(buf, buf->b_list);
932 put_last_free(buf);
934 write_unlock(&hash_table_lock);
935 spin_unlock(&lru_list_lock);
939 * bread() reads a specified block and returns the buffer that contains
940 * it. It returns NULL if the block was unreadable.
942 struct buffer_head * bread(kdev_t dev, int block, int size)
944 struct buffer_head * bh;
946 bh = getblk(dev, block, size);
947 if (buffer_uptodate(bh))
948 return bh;
949 ll_rw_block(READ, 1, &bh);
950 wait_on_buffer(bh);
951 if (buffer_uptodate(bh))
952 return bh;
953 brelse(bh);
954 return NULL;
958 * Ok, breada can be used as bread, but additionally marks other
959 * blocks for reading as well. End the argument list with a negative
960 * number.
963 #define NBUF 16
965 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
966 unsigned int pos, unsigned int filesize)
968 struct buffer_head * bhlist[NBUF];
969 unsigned int blocks;
970 struct buffer_head * bh;
971 int index;
972 int i, j;
974 if (pos >= filesize)
975 return NULL;
977 if (block < 0)
978 return NULL;
980 bh = getblk(dev, block, bufsize);
981 index = BUFSIZE_INDEX(bh->b_size);
983 if (buffer_uptodate(bh))
984 return(bh);
985 else ll_rw_block(READ, 1, &bh);
987 blocks = (filesize - pos) >> (9+index);
989 if (blocks < (read_ahead[MAJOR(dev)] >> index))
990 blocks = read_ahead[MAJOR(dev)] >> index;
991 if (blocks > NBUF)
992 blocks = NBUF;
994 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
996 bhlist[0] = bh;
997 j = 1;
998 for(i=1; i<blocks; i++) {
999 bh = getblk(dev,block+i,bufsize);
1000 if (buffer_uptodate(bh)) {
1001 brelse(bh);
1002 break;
1004 else bhlist[j++] = bh;
1007 /* Request the read for these buffers, and then release them. */
1008 if (j>1)
1009 ll_rw_block(READA, (j-1), bhlist+1);
1010 for(i=1; i<j; i++)
1011 brelse(bhlist[i]);
1013 /* Wait for this buffer, and then continue on. */
1014 bh = bhlist[0];
1015 wait_on_buffer(bh);
1016 if (buffer_uptodate(bh))
1017 return bh;
1018 brelse(bh);
1019 return NULL;
1023 * Note: the caller should wake up the buffer_wait list if needed.
1025 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1027 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1028 kmem_cache_free(bh_cachep, bh);
1029 } else {
1030 bh->b_blocknr = -1;
1031 init_waitqueue_head(&bh->b_wait);
1032 nr_unused_buffer_heads++;
1033 bh->b_next_free = unused_list;
1034 bh->b_this_page = NULL;
1035 unused_list = bh;
1039 static void put_unused_buffer_head(struct buffer_head *bh)
1041 spin_lock(&unused_list_lock);
1042 __put_unused_buffer_head(bh);
1043 spin_unlock(&unused_list_lock);
1047 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1048 * no-buffer-head deadlock. Return NULL on failure; waiting for
1049 * buffer heads is now handled in create_buffers().
1051 static struct buffer_head * get_unused_buffer_head(int async)
1053 struct buffer_head * bh;
1055 spin_lock(&unused_list_lock);
1056 if (nr_unused_buffer_heads > NR_RESERVED) {
1057 bh = unused_list;
1058 unused_list = bh->b_next_free;
1059 nr_unused_buffer_heads--;
1060 spin_unlock(&unused_list_lock);
1061 return bh;
1063 spin_unlock(&unused_list_lock);
1065 /* This is critical. We can't swap out pages to get
1066 * more buffer heads, because the swap-out may need
1067 * more buffer-heads itself. Thus SLAB_BUFFER.
1069 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1070 memset(bh, 0, sizeof(*bh));
1071 init_waitqueue_head(&bh->b_wait);
1072 return bh;
1076 * If we need an async buffer, use the reserved buffer heads.
1078 if (async) {
1079 spin_lock(&unused_list_lock);
1080 if (unused_list) {
1081 bh = unused_list;
1082 unused_list = bh->b_next_free;
1083 nr_unused_buffer_heads--;
1084 spin_unlock(&unused_list_lock);
1085 return bh;
1087 spin_unlock(&unused_list_lock);
1089 #if 0
1091 * (Pending further analysis ...)
1092 * Ordinary (non-async) requests can use a different memory priority
1093 * to free up pages. Any swapping thus generated will use async
1094 * buffer heads.
1096 if(!async &&
1097 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1098 memset(bh, 0, sizeof(*bh));
1099 init_waitqueue_head(&bh->b_wait);
1100 return bh;
1102 #endif
1104 return NULL;
1108 * Create the appropriate buffers when given a page for data area and
1109 * the size of each buffer.. Use the bh->b_this_page linked list to
1110 * follow the buffers created. Return NULL if unable to create more
1111 * buffers.
1112 * The async flag is used to differentiate async IO (paging, swapping)
1113 * from ordinary buffer allocations, and only async requests are allowed
1114 * to sleep waiting for buffer heads.
1116 static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async)
1118 DECLARE_WAITQUEUE(wait, current);
1119 struct buffer_head *bh, *head;
1120 long offset;
1122 try_again:
1123 head = NULL;
1124 offset = PAGE_SIZE;
1125 while ((offset -= size) >= 0) {
1126 bh = get_unused_buffer_head(async);
1127 if (!bh)
1128 goto no_grow;
1130 bh->b_dev = B_FREE; /* Flag as unused */
1131 bh->b_this_page = head;
1132 head = bh;
1134 bh->b_state = 0;
1135 bh->b_next_free = NULL;
1136 bh->b_pprev = NULL;
1137 atomic_set(&bh->b_count, 0);
1138 bh->b_size = size;
1140 bh->b_data = (char *) (page+offset);
1141 bh->b_list = BUF_CLEAN;
1142 bh->b_flushtime = 0;
1143 bh->b_end_io = end_buffer_io_bad;
1145 return head;
1147 * In case anything failed, we just free everything we got.
1149 no_grow:
1150 if (head) {
1151 do {
1152 bh = head;
1153 head = head->b_this_page;
1154 put_unused_buffer_head(bh);
1155 } while (head);
1157 /* Wake up any waiters ... */
1158 wake_up(&buffer_wait);
1162 * Return failure for non-async IO requests. Async IO requests
1163 * are not allowed to fail, so we have to wait until buffer heads
1164 * become available. But we don't want tasks sleeping with
1165 * partially complete buffers, so all were released above.
1167 if (!async)
1168 return NULL;
1170 /* We're _really_ low on memory. Now we just
1171 * wait for old buffer heads to become free due to
1172 * finishing IO. Since this is an async request and
1173 * the reserve list is empty, we're sure there are
1174 * async buffer heads in use.
1176 run_task_queue(&tq_disk);
1179 * Set our state for sleeping, then check again for buffer heads.
1180 * This ensures we won't miss a wake_up from an interrupt.
1182 add_wait_queue(&buffer_wait, &wait);
1183 current->state = TASK_UNINTERRUPTIBLE;
1184 if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
1185 current->policy |= SCHED_YIELD;
1186 schedule();
1188 remove_wait_queue(&buffer_wait, &wait);
1189 current->state = TASK_RUNNING;
1190 goto try_again;
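A standalone sketch (userspace C, simplified structure) of the page-carving loop in create_buffers() above: buffers are created from the end of the page backwards and chained through b_this_page; create_page_buffers() below then closes the chain into a ring with tail->b_this_page = head.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096L

struct buf {
        char *data;
        long size;
        struct buf *this_page;          /* stand-in for b_this_page */
};

static struct buf *carve_page(char *page, long size)
{
        struct buf *head = NULL, *bh;
        long offset = PAGE_SIZE;

        while ((offset -= size) >= 0) {
                bh = calloc(1, sizeof(*bh));
                if (!bh)
                        return NULL;    /* the real code unwinds what it built */
                bh->this_page = head;   /* chain onto the previous head */
                bh->data = page + offset;
                bh->size = size;
                head = bh;
        }
        return head;                    /* buffer covering offset 0 */
}

int main(void)
{
        char page[PAGE_SIZE];
        struct buf *bh = carve_page(page, 1024);

        while (bh) {
                struct buf *next = bh->this_page;
                printf("buffer at page offset %ld, size %ld\n",
                       (long)(bh->data - page), bh->size);
                free(bh);
                bh = next;
        }
        return 0;
}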
1193 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1195 struct buffer_head *head, *bh, *tail;
1196 int block;
1198 if (!PageLocked(page))
1199 BUG();
1200 if (page->owner != current)
1201 PAGE_BUG(page);
1203 * Allocate async buffer heads pointing to this page, just for I/O.
1204 * They show up in the buffer hash table and are registered in
1205 * page->buffers.
1207 head = create_buffers(page_address(page), size, 1);
1208 if (page->buffers)
1209 BUG();
1210 if (!head)
1211 BUG();
1212 tail = head;
1213 for (bh = head; bh; bh = bh->b_this_page) {
1214 block = *(b++);
1216 tail = bh;
1217 init_buffer(bh, end_buffer_io_async, NULL);
1218 bh->b_dev = dev;
1219 bh->b_blocknr = block;
1222 * When we use bmap, we define block zero to represent
1223 * a hole. ll_rw_page, however, may legitimately
1224 * access block zero, and we need to distinguish the
1225 * two cases.
1227 if (bmap && !block) {
1228 memset(bh->b_data, 0, size);
1229 set_bit(BH_Uptodate, &bh->b_state);
1230 continue;
1232 set_bit(BH_Mapped, &bh->b_state);
1234 tail->b_this_page = head;
1235 get_page(page);
1236 page->buffers = head;
1237 return 0;
1241 * We don't have to release all buffers here, but
1242 * we have to be sure that no dirty buffer is left
1243 * and no IO is going on (no buffer is locked), because
1244 * we have truncated the file and are going to free the
1245 * blocks on-disk..
1247 int block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
1249 struct buffer_head *head, *bh, *next;
1250 unsigned int curr_off = 0;
1252 if (!PageLocked(page))
1253 BUG();
1254 if (!page->buffers)
1255 return 0;
1257 head = page->buffers;
1258 bh = head;
1259 do {
1260 unsigned int next_off = curr_off + bh->b_size;
1261 next = bh->b_this_page;
1264 * is this block fully flushed?
1266 if (offset <= curr_off) {
1267 if (buffer_mapped(bh)) {
1268 atomic_inc(&bh->b_count);
1269 wait_on_buffer(bh);
1270 if (bh->b_dev == B_FREE)
1271 BUG();
1272 mark_buffer_clean(bh);
1273 clear_bit(BH_Uptodate, &bh->b_state);
1274 clear_bit(BH_Mapped, &bh->b_state);
1275 clear_bit(BH_Req, &bh->b_state);
1276 bh->b_blocknr = 0;
1277 atomic_dec(&bh->b_count);
1280 curr_off = next_off;
1281 bh = next;
1282 } while (bh != head);
1285 * subtle. We release buffer-heads only if this is
1286 * the 'final' flushpage. We have invalidated the bmap
1287 * cached value unconditionally, so real IO is not
1288 * possible anymore.
1290 * If the free doesn't work out, the buffers can be
1291 * left around - they just turn into anonymous buffers
1292 * instead.
1294 if (!offset) {
1295 if (!try_to_free_buffers(page))
1296 atomic_add(PAGE_CACHE_SIZE, &buffermem);
1299 return 0;
1302 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1304 struct buffer_head *bh, *head, *tail;
1306 head = create_buffers(page_address(page), blocksize, 1);
1307 if (page->buffers)
1308 BUG();
1310 bh = head;
1311 do {
1312 bh->b_dev = inode->i_dev;
1313 bh->b_blocknr = 0;
1314 bh->b_end_io = end_buffer_io_bad;
1315 tail = bh;
1316 bh = bh->b_this_page;
1317 } while (bh);
1318 tail->b_this_page = head;
1319 page->buffers = head;
1320 get_page(page);
1324 * block_write_full_page() is SMP-safe - currently it's still
1325 * being called with the kernel lock held, but the code is ready.
1327 int block_write_full_page(struct file *file, struct page *page)
1329 struct dentry *dentry = file->f_dentry;
1330 struct inode *inode = dentry->d_inode;
1331 int err, i;
1332 unsigned long block, offset;
1333 struct buffer_head *bh, *head;
1335 if (!PageLocked(page))
1336 BUG();
1338 if (!page->buffers)
1339 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1340 head = page->buffers;
1342 offset = page->offset;
1343 block = offset >> inode->i_sb->s_blocksize_bits;
1345 // FIXME: currently we assume page alignment.
1346 if (offset & (PAGE_SIZE-1))
1347 BUG();
1349 bh = head;
1350 i = 0;
1351 do {
1352 if (!bh)
1353 BUG();
1356 * If the buffer isn't up-to-date, we can't be sure
1357 * that the buffer has been initialized with the proper
1358 * block number information etc..
1360 * Leave it to the low-level FS to make all those
1361 * decisions (block #0 may actually be a valid block)
1363 bh->b_end_io = end_buffer_io_sync;
1364 if (!buffer_mapped(bh)) {
1365 err = inode->i_op->get_block(inode, block, bh, 1);
1366 if (err)
1367 goto out;
1369 set_bit(BH_Uptodate, &bh->b_state);
1370 mark_buffer_dirty(bh,0);
1372 bh = bh->b_this_page;
1373 block++;
1374 } while (bh != head);
1376 SetPageUptodate(page);
1377 return 0;
1378 out:
1379 ClearPageUptodate(page);
1380 return err;
1383 int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
1385 struct dentry *dentry = file->f_dentry;
1386 struct inode *inode = dentry->d_inode;
1387 unsigned long block;
1388 int err, partial;
1389 unsigned long blocksize, start_block, end_block;
1390 unsigned long start_offset, start_bytes, end_bytes;
1391 unsigned long bbits, blocks, i, len;
1392 struct buffer_head *bh, *head;
1393 char * target_buf;
1395 target_buf = (char *)page_address(page) + offset;
1397 if (!PageLocked(page))
1398 BUG();
1400 blocksize = inode->i_sb->s_blocksize;
1401 if (!page->buffers)
1402 create_empty_buffers(page, inode, blocksize);
1403 head = page->buffers;
1405 bbits = inode->i_sb->s_blocksize_bits;
1406 block = page->offset >> bbits;
1407 blocks = PAGE_SIZE >> bbits;
1408 start_block = offset >> bbits;
1409 end_block = (offset + bytes - 1) >> bbits;
1410 start_offset = offset & (blocksize - 1);
1411 start_bytes = blocksize - start_offset;
1412 if (start_bytes > bytes)
1413 start_bytes = bytes;
1414 end_bytes = (offset+bytes) & (blocksize - 1);
1415 if (end_bytes > bytes)
1416 end_bytes = bytes;
1418 if (offset < 0 || offset >= PAGE_SIZE)
1419 BUG();
1420 if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
1421 BUG();
1422 if (start_block < 0 || start_block >= blocks)
1423 BUG();
1424 if (end_block < 0 || end_block >= blocks)
1425 BUG();
1426 // FIXME: currently we assume page alignment.
1427 if (page->offset & (PAGE_SIZE-1))
1428 BUG();
1430 i = 0;
1431 bh = head;
1432 partial = 0;
1433 do {
1434 if (!bh)
1435 BUG();
1437 if ((i < start_block) || (i > end_block)) {
1438 if (!buffer_uptodate(bh))
1439 partial = 1;
1440 goto skip;
1444 * If the buffer is not up-to-date, we need to ask the low-level
1445 * FS to do something for us (we used to have assumptions about
1446 * the meaning of b_blocknr etc, that's bad).
1448 * If "update" is set, that means that the low-level FS should
1449 * try to make sure that the block is up-to-date because we're
1450 * not going to fill it completely.
1452 bh->b_end_io = end_buffer_io_sync;
1453 if (!buffer_mapped(bh)) {
1454 err = inode->i_op->get_block(inode, block, bh, 1);
1455 if (err)
1456 goto out;
1459 if (!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) {
1460 if (buffer_new(bh)) {
1461 memset(bh->b_data, 0, bh->b_size);
1462 } else {
1463 ll_rw_block(READ, 1, &bh);
1464 wait_on_buffer(bh);
1465 err = -EIO;
1466 if (!buffer_uptodate(bh))
1467 goto out;
1471 len = blocksize;
1472 if (start_offset) {
1473 len = start_bytes;
1474 start_offset = 0;
1475 } else if (end_bytes && (i == end_block)) {
1476 len = end_bytes;
1477 end_bytes = 0;
1479 err = copy_from_user(target_buf, buf, len);
1480 target_buf += len;
1481 buf += len;
1484 * we dirty buffers only after copying the data into
1485 * the page - this way we can dirty the buffer even if
1486 * the bh is still doing IO.
1488 * NOTE! This also does a direct dirty balance check,
1489 * rather than relying on bdflush just waking up every
1490 * once in a while. This is to catch (and slow down)
1491 * the processes that write tons of buffer..
1493 * Note how we do NOT want to do this in the full block
1494 * case: full pages are flushed not by the people who
1495 * dirtied them, but by people who need memory. And we
1496 * should not penalize them for somebody else writing
1497 * lots of dirty pages.
1499 set_bit(BH_Uptodate, &bh->b_state);
1500 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1501 __mark_dirty(bh, 0);
1502 if (too_many_dirty_buffers)
1503 balance_dirty(bh->b_dev);
1506 if (err) {
1507 err = -EFAULT;
1508 goto out;
1511 skip:
1512 i++;
1513 block++;
1514 bh = bh->b_this_page;
1515 } while (bh != head);
1518 * If this is a partial write that happened to make all buffers
1519 * uptodate then we can optimize away a bogus readpage() for
1520 * the next read(). Here we 'discover' whether the page went
1521 * uptodate as a result of this (potentially partial) write.
1523 if (!partial)
1524 SetPageUptodate(page);
1525 return bytes;
1526 out:
1527 ClearPageUptodate(page);
1528 return err;
1533 * IO completion routine for a buffer_head being used for kiobuf IO: we
1534 * can't dispatch the kiobuf callback until io_count reaches 0.
1537 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1539 struct kiobuf *kiobuf;
1541 mark_buffer_uptodate(bh, uptodate);
1543 kiobuf = bh->b_kiobuf;
1544 if (atomic_dec_and_test(&kiobuf->io_count))
1545 kiobuf->end_io(kiobuf);
1546 if (!uptodate)
1547 kiobuf->errno = -EIO;
1552 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1553 * for them to complete. Clean up the buffer_heads afterwards.
1556 #define dprintk(x...)
1558 static int do_kio(struct kiobuf *kiobuf,
1559 int rw, int nr, struct buffer_head *bh[], int size)
1561 int iosize;
1562 int i;
1563 struct buffer_head *tmp;
1565 struct task_struct *tsk = current;
1566 DECLARE_WAITQUEUE(wait, tsk);
1568 dprintk ("do_kio start %d\n", rw);
1570 if (rw == WRITE)
1571 rw = WRITERAW;
1572 atomic_add(nr, &kiobuf->io_count);
1573 kiobuf->errno = 0;
1574 ll_rw_block(rw, nr, bh);
1576 kiobuf_wait_for_io(kiobuf);
1578 spin_lock(&unused_list_lock);
1580 iosize = 0;
1581 for (i = nr; --i >= 0; ) {
1582 iosize += size;
1583 tmp = bh[i];
1584 if (!buffer_uptodate(tmp)) {
1585 /* We are traversing bh'es in reverse order so
1586 clearing iosize on error calculates the
1587 amount of IO before the first error. */
1588 iosize = 0;
1590 __put_unused_buffer_head(tmp);
1593 spin_unlock(&unused_list_lock);
1595 dprintk ("do_kio end %d %d\n", iosize, err);
1597 if (iosize)
1598 return iosize;
1599 if (kiobuf->errno)
1600 return kiobuf->errno;
1601 return -EIO;
1605 * Start I/O on a physical range of kernel memory, defined by a vector
1606 * of kiobuf structs (much like a user-space iovec list).
1608 * The kiobuf must already be locked for IO. IO is submitted
1609 * asynchronously: you need to check page->locked, page->uptodate, and
1610 * maybe wait on page->wait.
1612 * It is up to the caller to make sure that there are enough blocks
1613 * passed in to completely map the iobufs to disk.
1616 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1617 kdev_t dev, unsigned long b[], int size, int bmap)
1619 int err;
1620 int length;
1621 int transferred;
1622 int i;
1623 int bufind;
1624 int pageind;
1625 int bhind;
1626 int offset;
1627 unsigned long blocknr;
1628 struct kiobuf * iobuf = NULL;
1629 unsigned long page;
1630 struct page * map;
1631 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1633 if (!nr)
1634 return 0;
1637 * First, do some alignment and validity checks
1639 for (i = 0; i < nr; i++) {
1640 iobuf = iovec[i];
1641 if ((iobuf->offset & (size-1)) ||
1642 (iobuf->length & (size-1)))
1643 return -EINVAL;
1644 if (!iobuf->locked)
1645 panic("brw_kiovec: iobuf not locked for I/O");
1646 if (!iobuf->nr_pages)
1647 panic("brw_kiovec: iobuf not initialised");
1650 /* DEBUG */
1651 #if 0
1652 return iobuf->length;
1653 #endif
1654 dprintk ("brw_kiovec: start\n");
1657 * OK to walk down the iovec doing page IO on each page we find.
1659 bufind = bhind = transferred = err = 0;
1660 for (i = 0; i < nr; i++) {
1661 iobuf = iovec[i];
1662 offset = iobuf->offset;
1663 length = iobuf->length;
1664 dprintk ("iobuf %d %d %d\n", offset, length, size);
1666 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1667 page = iobuf->pagelist[pageind];
1668 map = iobuf->maplist[pageind];
1670 while (length > 0) {
1671 blocknr = b[bufind++];
1672 tmp = get_unused_buffer_head(0);
1673 if (!tmp) {
1674 err = -ENOMEM;
1675 goto error;
1678 tmp->b_dev = B_FREE;
1679 tmp->b_size = size;
1680 tmp->b_data = (char *) (page + offset);
1681 tmp->b_this_page = tmp;
1683 init_buffer(tmp, end_buffer_io_kiobuf, NULL);
1684 tmp->b_dev = dev;
1685 tmp->b_blocknr = blocknr;
1686 tmp->b_state = 1 << BH_Mapped;
1687 tmp->b_kiobuf = iobuf;
1689 if (rw == WRITE) {
1690 set_bit(BH_Uptodate, &tmp->b_state);
1691 set_bit(BH_Dirty, &tmp->b_state);
1694 dprintk ("buffer %d (%d) at %p\n",
1695 bhind, tmp->b_blocknr, tmp->b_data);
1696 bh[bhind++] = tmp;
1697 length -= size;
1698 offset += size;
1701 * Start the IO if we have got too much
1703 if (bhind >= KIO_MAX_SECTORS) {
1704 err = do_kio(iobuf, rw, bhind, bh, size);
1705 if (err >= 0)
1706 transferred += err;
1707 else
1708 goto finished;
1709 bhind = 0;
1712 if (offset >= PAGE_SIZE) {
1713 offset = 0;
1714 break;
1716 } /* End of block loop */
1717 } /* End of page loop */
1718 } /* End of iovec loop */
1720 /* Is there any IO still left to submit? */
1721 if (bhind) {
1722 err = do_kio(iobuf, rw, bhind, bh, size);
1723 if (err >= 0)
1724 transferred += err;
1725 else
1726 goto finished;
1729 finished:
1730 dprintk ("brw_kiovec: end (%d, %d)\n", transferred, err);
1731 if (transferred)
1732 return transferred;
1733 return err;
1735 error:
1736 /* We got an error allocating the bh'es. Just free the current
1737 buffer_heads and exit. */
1738 spin_lock(&unused_list_lock);
1739 for (i = bhind; --i >= 0; ) {
1740 __put_unused_buffer_head(bh[i]);
1742 spin_unlock(&unused_list_lock);
1743 goto finished;
1747 * Start I/O on a page.
1748 * This function expects the page to be locked and may return
1749 * before I/O is complete. You then have to check page->locked,
1750 * page->uptodate, and maybe wait on page->wait.
1752 * brw_page() is SMP-safe, although it's being called with the
1753 * kernel lock held - but the code is ready.
1755 * FIXME: we need a swapper_inode->get_block function to remove
1756 * some of the bmap kludges and interface ugliness here.
1758 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1760 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1761 int nr, fresh /* temporary debugging flag */, block;
1763 if (!PageLocked(page))
1764 panic("brw_page: page not locked for I/O");
1765 // clear_bit(PG_error, &page->flags);
1767 * We pretty much rely on the page lock for this, because
1768 * create_page_buffers() might sleep.
1770 fresh = 0;
1771 if (!page->buffers) {
1772 create_page_buffers(rw, page, dev, b, size, bmap);
1773 fresh = 1;
1775 if (!page->buffers)
1776 BUG();
1777 page->owner = (void *)-1;
1779 head = page->buffers;
1780 bh = head;
1781 nr = 0;
1782 do {
1783 block = *(b++);
1785 if (fresh && (atomic_read(&bh->b_count) != 0))
1786 BUG();
1787 if (rw == READ) {
1788 if (!fresh)
1789 BUG();
1790 if (bmap && !block) {
1791 if (block)
1792 BUG();
1793 } else {
1794 if (bmap && !block)
1795 BUG();
1796 if (!buffer_uptodate(bh)) {
1797 arr[nr++] = bh;
1798 atomic_inc(&bh->b_count);
1801 } else { /* WRITE */
1802 if (!bh->b_blocknr) {
1803 if (!block)
1804 BUG();
1805 bh->b_blocknr = block;
1806 } else {
1807 if (!block)
1808 BUG();
1810 set_bit(BH_Uptodate, &bh->b_state);
1811 set_bit(BH_Dirty, &bh->b_state);
1812 arr[nr++] = bh;
1813 atomic_inc(&bh->b_count);
1815 bh = bh->b_this_page;
1816 } while (bh != head);
1817 if (rw == READ)
1818 ++current->maj_flt;
1819 if ((rw == READ) && nr) {
1820 if (Page_Uptodate(page))
1821 BUG();
1822 ll_rw_block(rw, nr, arr);
1823 } else {
1824 if (!nr && rw == READ) {
1825 SetPageUptodate(page);
1826 page->owner = current;
1827 UnlockPage(page);
1829 if (nr && (rw == WRITE))
1830 ll_rw_block(rw, nr, arr);
1832 return 0;
1836 * Generic "read page" function for block devices that have the normal
1837 * bmap functionality. This is most of the block device filesystems.
1838 * Reads the page asynchronously --- the unlock_buffer() and
1839 * mark_buffer_uptodate() functions propagate buffer state into the
1840 * page struct once IO has completed.
1842 int block_read_full_page(struct file * file, struct page * page)
1844 struct dentry *dentry = file->f_dentry;
1845 struct inode *inode = dentry->d_inode;
1846 unsigned long iblock;
1847 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1848 unsigned int blocksize, blocks;
1849 int nr;
1851 if (!PageLocked(page))
1852 PAGE_BUG(page);
1853 blocksize = inode->i_sb->s_blocksize;
1854 if (!page->buffers)
1855 create_empty_buffers(page, inode, blocksize);
1856 head = page->buffers;
1858 blocks = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1859 iblock = page->offset >> inode->i_sb->s_blocksize_bits;
1860 page->owner = (void *)-1;
1861 head = page->buffers;
1862 bh = head;
1863 nr = 0;
1865 do {
1866 if (buffer_uptodate(bh))
1867 continue;
1869 if (!buffer_mapped(bh)) {
1870 inode->i_op->get_block(inode, iblock, bh, 0);
1871 if (!buffer_mapped(bh)) {
1872 memset(bh->b_data, 0, blocksize);
1873 set_bit(BH_Uptodate, &bh->b_state);
1874 continue;
1878 init_buffer(bh, end_buffer_io_async, NULL);
1879 atomic_inc(&bh->b_count);
1880 arr[nr] = bh;
1881 nr++;
1882 } while (iblock++, (bh = bh->b_this_page) != head);
1884 ++current->maj_flt;
1885 if (nr) {
1886 if (Page_Uptodate(page))
1887 BUG();
1888 ll_rw_block(READ, nr, arr);
1889 } else {
1891 * all buffers are uptodate - we can set the page
1892 * uptodate as well.
1894 SetPageUptodate(page);
1895 page->owner = current;
1896 UnlockPage(page);
1898 return 0;
1902 * Try to increase the number of buffers available: the size argument
1903 * is used to determine what kind of buffers we want.
1905 static int grow_buffers(int size)
1907 unsigned long page;
1908 struct buffer_head *bh, *tmp;
1909 struct buffer_head * insert_point;
1910 int isize;
1912 if ((size & 511) || (size > PAGE_SIZE)) {
1913 printk("VFS: grow_buffers: size = %d\n",size);
1914 return 0;
1917 if (!(page = __get_free_page(GFP_BUFFER)))
1918 return 0;
1919 bh = create_buffers(page, size, 0);
1920 if (!bh) {
1921 free_page(page);
1922 return 0;
1925 isize = BUFSIZE_INDEX(size);
1927 spin_lock(&free_list[isize].lock);
1928 insert_point = free_list[isize].list;
1929 tmp = bh;
1930 while (1) {
1931 if (insert_point) {
1932 tmp->b_next_free = insert_point->b_next_free;
1933 tmp->b_prev_free = insert_point;
1934 insert_point->b_next_free->b_prev_free = tmp;
1935 insert_point->b_next_free = tmp;
1936 } else {
1937 tmp->b_prev_free = tmp;
1938 tmp->b_next_free = tmp;
1940 insert_point = tmp;
1941 if (tmp->b_this_page)
1942 tmp = tmp->b_this_page;
1943 else
1944 break;
1946 tmp->b_this_page = bh;
1947 free_list[isize].list = bh;
1948 spin_unlock(&free_list[isize].lock);
1950 mem_map[MAP_NR(page)].buffers = bh;
1951 atomic_add(PAGE_SIZE, &buffermem);
1952 return 1;
1956 * Can the buffer be thrown out?
1958 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1959 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
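A quick illustration (userspace; the BH_* bit numbers are assumptions for the example) of how buffer_busy() combines the reference count and the Dirty/Lock/Protected state bits into a single nonzero-means-busy test:

#include <stdio.h>

enum { BH_Dirty = 1, BH_Lock = 2, BH_Protected = 6 };   /* assumed bit numbers */
#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))

struct fake_bh { int b_count; unsigned long b_state; };
#define buffer_busy(bh) ((bh)->b_count | ((bh)->b_state & BUFFER_BUSY_BITS))

int main(void)
{
        struct fake_bh clean = { 0, 0 };
        struct fake_bh dirty = { 0, 1UL << BH_Dirty };
        struct fake_bh held  = { 1, 0 };

        printf("clean buffer busy? %d\n", buffer_busy(&clean) != 0);
        printf("dirty buffer busy? %d\n", buffer_busy(&dirty) != 0);
        printf("referenced buffer busy? %d\n", buffer_busy(&held) != 0);
        return 0;
}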
1962 * try_to_free_buffers() checks if all the buffers on this particular page
1963 * are unused, and frees the page if so.
1965 * Wake up bdflush() if this fails - if we're running low on memory due
1966 * to dirty buffers, we need to flush them out as quickly as possible.
1968 * NOTE: There are quite a number of ways that threads of control can
1969 * obtain a reference to a buffer head within a page. So we must
1970 * lock out all of these paths to cleanly toss the page.
1972 int try_to_free_buffers(struct page * page)
1974 struct buffer_head * tmp, * bh = page->buffers;
1975 int index = BUFSIZE_INDEX(bh->b_size);
1976 int ret;
1978 spin_lock(&lru_list_lock);
1979 write_lock(&hash_table_lock);
1980 spin_lock(&free_list[index].lock);
1981 tmp = bh;
1982 do {
1983 struct buffer_head * p = tmp;
1985 tmp = tmp->b_this_page;
1986 if (buffer_busy(p))
1987 goto busy_buffer_page;
1988 } while (tmp != bh);
1990 spin_lock(&unused_list_lock);
1991 tmp = bh;
1992 do {
1993 struct buffer_head * p = tmp;
1994 tmp = tmp->b_this_page;
1996 /* The buffer can be either on the regular
1997 * queues or on the free list..
1999 if (p->b_dev == B_FREE) {
2000 __remove_from_free_list(p, index);
2001 } else {
2002 if (p->b_pprev)
2003 __hash_unlink(p);
2004 __remove_from_lru_list(p, p->b_list);
2006 __put_unused_buffer_head(p);
2007 } while (tmp != bh);
2008 spin_unlock(&unused_list_lock);
2010 /* Wake up anyone waiting for buffer heads */
2011 wake_up(&buffer_wait);
2013 /* And free the page */
2014 page->buffers = NULL;
2015 __free_page(page);
2016 ret = 1;
2017 out:
2018 spin_unlock(&free_list[index].lock);
2019 write_unlock(&hash_table_lock);
2020 spin_unlock(&lru_list_lock);
2021 return ret;
2023 busy_buffer_page:
2024 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2025 too_many_dirty_buffers = 1;
2026 wakeup_bdflush(0);
2027 ret = 0;
2028 goto out;
2031 /* ===================== Init ======================= */
2034 * allocate the hash table and init the free list.
2035 * Use gfp() for the hash table to decrease TLB misses, use
2036 * SLAB cache for buffer heads.
2038 void __init buffer_init(unsigned long memory_size)
2040 int order, i;
2041 unsigned int nr_hash;
2043 /* The buffer cache hash table is less important these days,
2044 * trim it a bit.
2046 memory_size >>= 14;
2047 memory_size *= sizeof(struct buffer_head *);
2048 for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
2051 /* try to allocate something until we get it or we're asking
2052 for something that is really too small */
2054 do {
2055 unsigned long tmp;
2057 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2058 bh_hash_mask = (nr_hash - 1);
2060 tmp = nr_hash;
2061 bh_hash_shift = 0;
2062 while((tmp >>= 1UL) != 0UL)
2063 bh_hash_shift++;
2065 hash_table = (struct buffer_head **)
2066 __get_free_pages(GFP_ATOMIC, order);
2067 } while (hash_table == NULL && --order > 0);
2068 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2069 nr_hash, order, (1UL<<order) * PAGE_SIZE);
2071 if (!hash_table)
2072 panic("Failed to allocate buffer hash table\n");
2074 /* Setup hash chains. */
2075 for(i = 0; i < nr_hash; i++)
2076 hash_table[i] = NULL;
2078 /* Setup free lists. */
2079 for(i = 0; i < NR_SIZES; i++) {
2080 free_list[i].list = NULL;
2081 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2084 /* Setup lru lists. */
2085 for(i = 0; i < NR_LIST; i++)
2086 lru_list[i] = NULL;
2088 bh_cachep = kmem_cache_create("buffer_head",
2089 sizeof(struct buffer_head),
2091 SLAB_HWCACHE_ALIGN, NULL, NULL);
2092 if(!bh_cachep)
2093 panic("Cannot create buffer head SLAB cache\n");
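A hedged userspace sketch of the hash sizing arithmetic in buffer_init() above (4K pages, 64-bit pointers and 64 MB of memory are assumptions for the example): the table is sized at one pointer per 16K of memory, rounded up to a whole power-of-two page order, and bh_hash_shift/bh_hash_mask fall out of the resulting bucket count.

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long memory_size = 64UL << 20;         /* assume 64 MB of RAM */
        unsigned long bytes, nr_hash, tmp;
        unsigned int order = 0, shift = 0;

        bytes = (memory_size >> 14) * sizeof(void *);   /* one pointer per 16K */
        while ((PAGE_SIZE << order) < bytes)
                order++;

        nr_hash = (PAGE_SIZE << order) / sizeof(void *);
        for (tmp = nr_hash; tmp >>= 1; )
                shift++;

        printf("order %u, %lu buckets, shift %u, mask 0x%lx\n",
               order, nr_hash, shift, nr_hash - 1);
        return 0;
}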
2097 /* ====================== bdflush support =================== */
2099 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2100 * response to dirty buffers. Once this process is activated, we write back
2101 * a limited number of buffers to the disks and then go back to sleep again.
2103 static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2104 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2105 struct task_struct *bdflush_tsk = 0;
2107 void wakeup_bdflush(int wait)
2109 if (current == bdflush_tsk)
2110 return;
2111 if (wait)
2112 run_task_queue(&tq_disk);
2113 wake_up(&bdflush_wait);
2114 if (wait)
2115 sleep_on(&bdflush_done);
2120 * Here we attempt to write back old buffers. We also try to flush inodes
2121 * and supers as well, since this function is essentially "update", and
2122 * otherwise there would be no way of ensuring that these quantities ever
2123 * get written back. Ideally, we would have a timestamp on the inodes
2124 * and superblocks so that we could write back only the old ones as well
2127 static int sync_old_buffers(void)
2129 int nlist;
2131 lock_kernel();
2132 sync_supers(0);
2133 sync_inodes(0);
2134 unlock_kernel();
2136 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2137 struct buffer_head *bh;
2138 repeat:
2139 spin_lock(&lru_list_lock);
2140 bh = lru_list[nlist];
2141 if(bh) {
2142 struct buffer_head *next;
2143 int i;
2144 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
2145 next = bh->b_next_free;
2147 /* If the buffer is not on the proper list,
2148 * then refile it.
2150 if ((nlist == BUF_DIRTY &&
2151 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2152 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2153 __refile_buffer(bh);
2154 continue;
2157 if (buffer_locked(bh) || !buffer_dirty(bh))
2158 continue;
2160 /* OK, now we are committed to write it out. */
2161 bh->b_flushtime = 0;
2162 atomic_inc(&bh->b_count);
2163 spin_unlock(&lru_list_lock);
2164 ll_rw_block(WRITE, 1, &bh);
2165 atomic_dec(&bh->b_count);
2166 goto repeat;
2169 spin_unlock(&lru_list_lock);
2171 run_task_queue(&tq_disk);
2172 return 0;
2175 /* This is the interface to bdflush. As we get more sophisticated, we can
2176 * pass tuning parameters to this "process", to adjust how it behaves.
2177 * We would want to verify each parameter, however, to make sure that it
2178 * is reasonable. */
2180 asmlinkage int sys_bdflush(int func, long data)
2182 if (!capable(CAP_SYS_ADMIN))
2183 return -EPERM;
2185 if (func == 1) {
2186 int error;
2187 struct mm_struct *user_mm;
2190 * bdflush will spend all of its time in kernel-space,
2191 * without touching user-space, so we can switch it into
2192 * 'lazy TLB mode' to reduce the cost of context-switches
2193 * to and from bdflush.
2195 user_mm = start_lazy_tlb();
2196 error = sync_old_buffers();
2197 end_lazy_tlb(user_mm);
2198 return error;
2201 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2202 if (func >= 2) {
2203 int i = (func-2) >> 1;
2204 if (i >= 0 && i < N_PARAM) {
2205 if ((func & 1) == 0)
2206 return put_user(bdf_prm.data[i], (int*)data);
2208 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2209 bdf_prm.data[i] = data;
2210 return 0;
2213 return -EINVAL;
2216 /* Func 0 used to launch the actual bdflush and then never
2217 * return (unless explicitly killed). We return zero here to
2218 * remain semi-compatible with present update(8) programs.
2220 return 0;
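A small sketch (userspace) of the func encoding handled above: func 1 runs sync_old_buffers(), func 0 is the compatibility no-op, and for func >= 2 the parameter index is (func-2)>>1, with even values reading and odd values writing (subject to the min/max clamp).

#include <stdio.h>

static void decode_bdflush_func(int func)
{
        if (func < 2) {
                printf("func %d: %s\n", func,
                       func ? "flush old buffers" : "compatibility no-op");
                return;
        }
        printf("func %d: %s parameter %d\n",
               func, (func & 1) ? "write" : "read", (func - 2) >> 1);
}

int main(void)
{
        int func;

        for (func = 0; func < 6; func++)
                decode_bdflush_func(func);
        return 0;
}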
2224 * This is the actual bdflush daemon itself. It used to be started from
2225 * the syscall above, but now we launch it ourselves internally with
2226 * kernel_thread(...) directly after the first thread in init/main.c
2228 int bdflush(void * unused)
2231 * We have a bare-bones task_struct, and really should fill
2232 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2233 * display semi-sane things. Not real crucial though...
2236 current->session = 1;
2237 current->pgrp = 1;
2238 sprintf(current->comm, "kflushd");
2239 bdflush_tsk = current;
2241 for (;;) {
2242 int nlist;
2244 CHECK_EMERGENCY_SYNC
2246 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2247 int nr, major, written = 0;
2248 struct buffer_head *next;
2250 repeat:
2251 spin_lock(&lru_list_lock);
2252 next = lru_list[nlist];
2253 nr = nr_buffers_type[nlist];
2254 while (nr-- > 0) {
2255 struct buffer_head *bh = next;
2257 next = next->b_next_free;
2259 /* If the buffer is not on the correct list,
2260 * then refile it.
2262 if ((nlist == BUF_DIRTY &&
2263 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2264 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2265 __refile_buffer(bh);
2266 continue;
2269 /* If we aren't in panic mode, don't write out too much
2270 * at a time. Also, don't write out buffers we don't
2271 * really have to write out yet..
2273 if (!too_many_dirty_buffers) {
2274 if (written > bdf_prm.b_un.ndirty)
2275 break;
2276 if (time_before(jiffies, bh->b_flushtime))
2277 continue;
2280 if (buffer_locked(bh) || !buffer_dirty(bh))
2281 continue;
2283 major = MAJOR(bh->b_dev);
2284 written++;
2285 bh->b_flushtime = 0;
2288 * For the loop major we can try to do asynchronous writes,
2289 * but we have to guarantee that we're making some progress..
2291 atomic_inc(&bh->b_count);
2292 spin_unlock(&lru_list_lock);
2293 if (major == LOOP_MAJOR && written > 1) {
2294 ll_rw_block(WRITEA, 1, &bh);
2295 if (buffer_dirty(bh))
2296 --written;
2297 } else
2298 ll_rw_block(WRITE, 1, &bh);
2299 atomic_dec(&bh->b_count);
2300 goto repeat;
2302 spin_unlock(&lru_list_lock);
2304 run_task_queue(&tq_disk);
2305 wake_up(&bdflush_done);
2308 * If there are still a lot of dirty buffers around,
2309 * skip the sleep and flush some more. Otherwise, we
2310 * sleep for a while and mark us as not being in panic
2311 * mode..
2313 if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
2314 too_many_dirty_buffers = 0;
2315 spin_lock_irq(&current->sigmask_lock);
2316 flush_signals(current);
2317 spin_unlock_irq(&current->sigmask_lock);
2318 interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);