Import 2.3.12pre1
[davej-history.git] / fs / buffer.c
blob f578992b3b2f0f37a3fb862c13fcfe0b17c23163
1 /*
2  * linux/fs/buffer.c
3  *
4  * Copyright (C) 1991, 1992 Linus Torvalds
5  */
7 /*
8  * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9  * been avoided by NEVER letting an interrupt change a buffer (except for the
10  * data, of course), but instead letting the caller do it.
11  */
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17  */
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20  * hash table, use SLAB cache for buffer heads. -DaveM
21  */
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24  * - RMK
25  */
27 /* Thread it... -DaveM */
29 #include <linux/sched.h>
30 #include <linux/fs.h>
31 #include <linux/malloc.h>
32 #include <linux/locks.h>
33 #include <linux/errno.h>
34 #include <linux/swap.h>
35 #include <linux/swapctl.h>
36 #include <linux/smp_lock.h>
37 #include <linux/vmalloc.h>
38 #include <linux/blkdev.h>
39 #include <linux/sysrq.h>
40 #include <linux/file.h>
41 #include <linux/init.h>
42 #include <linux/quotaops.h>
44 #include <asm/uaccess.h>
45 #include <asm/io.h>
46 #include <asm/bitops.h>
47 #include <asm/mmu_context.h>
49 #define NR_SIZES 7
50 static char buffersize_index[65] =
51 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
52 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
53 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
54 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
55 6};
57 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
58 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
59 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
60 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
61 number of unused buffer heads */
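The buffersize_index[] table above maps (size >> 9) to a slot in free_list[], so only the power-of-two sizes from 512 bytes up to 32K resolve to a valid index; everything else yields -1. A minimal standalone sketch of that mapping (userspace C, not part of buffer.c):

#include <stdio.h>

/* Same table and macro as above, reproduced for illustration only. */
static const char buffersize_index[65] =
{-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  6};

#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])

int main(void)
{
	/* 512 -> 0, 1024 -> 1, 2048 -> 2, ..., 32768 -> 6 */
	for (int size = 512; size <= 32768; size <<= 1)
		printf("%5d bytes -> free_list[%d]\n", size, BUFSIZE_INDEX(size));
	return 0;
}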
63 /* Anti-deadlock ordering:
64 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
68 * Hash table gook..
70 static unsigned int bh_hash_mask = 0;
71 static unsigned int bh_hash_shift = 0;
72 static struct buffer_head **hash_table;
73 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
75 static struct buffer_head *lru_list[NR_LIST];
76 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
77 static int nr_buffers_type[NR_LIST] = {0,};
79 static struct buffer_head * unused_list = NULL;
80 static int nr_unused_buffer_heads = 0;
81 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
82 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
84 struct bh_free_head {
85 struct buffer_head *list;
86 spinlock_t lock;
88 static struct bh_free_head free_list[NR_SIZES];
90 static kmem_cache_t *bh_cachep;
92 static int grow_buffers(int size);
94 /* This is used by some architectures to estimate available memory. */
95 atomic_t buffermem = ATOMIC_INIT(0);
97 /* Here is the parameter block for the bdflush process. If you add or
98 * remove any of the parameters, make sure to update kernel/sysctl.c.
101 #define N_PARAM 9
103 /* The dummy values in this structure are left in there for compatibility
104 * with old programs that play with the /proc entries.
106 union bdflush_param {
107 struct {
108 int nfract; /* Percentage of buffer cache dirty to
109 activate bdflush */
110 int ndirty; /* Maximum number of dirty blocks to write out per
111 wake-cycle */
112 int nrefill; /* Number of clean buffers to try to obtain
113 each time we call refill */
114 int nref_dirt; /* Dirty buffer threshold for activating bdflush
115 when trying to refill buffers. */
116 int dummy1; /* unused */
117 int age_buffer; /* Time for normal buffer to age before we flush it */
118 int age_super; /* Time for superblock to age before we flush it */
119 int dummy2; /* unused */
120 int dummy3; /* unused */
121 } b_un;
122 unsigned int data[N_PARAM];
123 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
125 /* These are the min and max parameter values that we will allow to be assigned */
126 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
127 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
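These tunables are exposed through the bdflush(2) system call implemented by sys_bdflush() near the end of this file: func 2+2*i reads parameter i, func 3+2*i writes it, with writes range-checked against bdflush_min[]/bdflush_max[]. A hypothetical userspace sketch that dumps the current values; it assumes the syscall number is exported as SYS_bdflush and, per sys_bdflush(), needs CAP_SYS_ADMIN:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	for (int i = 0; i < 9; i++) {	/* N_PARAM == 9 */
		int value;
		/* func = 2 + 2*i means "read parameter i" in sys_bdflush() */
		if (syscall(SYS_bdflush, 2 + 2 * i, &value) == 0)
			printf("bdf_prm.data[%d] = %d\n", i, value);
	}
	return 0;
}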
129 void wakeup_bdflush(int);
132 * Rewrote the wait-routines to use the "new" wait-queue functionality,
133 * and getting rid of the cli-sti pairs. The wait-queue routines still
134 * need cli-sti, but now it's just a couple of 386 instructions or so.
136 * Note that the real wait_on_buffer() is an inline function that checks
137 * if 'b_wait' is set before calling this, so that the queues aren't set
138 * up unnecessarily.
140 void __wait_on_buffer(struct buffer_head * bh)
142 struct task_struct *tsk = current;
143 DECLARE_WAITQUEUE(wait, tsk);
145 atomic_inc(&bh->b_count);
146 add_wait_queue(&bh->b_wait, &wait);
147 repeat:
148 tsk->state = TASK_UNINTERRUPTIBLE;
149 run_task_queue(&tq_disk);
150 if (buffer_locked(bh)) {
151 schedule();
152 goto repeat;
154 tsk->state = TASK_RUNNING;
155 remove_wait_queue(&bh->b_wait, &wait);
156 atomic_dec(&bh->b_count);
159 /* Call sync_buffers with wait!=0 to ensure that the call does not
160 * return until all buffer writes have completed. Sync() may return
161 * before the writes have finished; fsync() may not.
164 /* Godamity-damn. Some buffers (bitmaps for filesystems)
165 * spontaneously dirty themselves without ever brelse being called.
166 * We will ultimately want to put these in a separate list, but for
167 * now we search all of the lists for dirty buffers.
169 static int sync_buffers(kdev_t dev, int wait)
171 int i, retry, pass = 0, err = 0;
172 struct buffer_head * bh, *next;
174 /* One pass for no-wait, three for wait:
175 * 0) write out all dirty, unlocked buffers;
176 * 1) write out all dirty buffers, waiting if locked;
177 * 2) wait for completion by waiting for all buffers to unlock.
179 do {
180 retry = 0;
182 /* We search all lists as a failsafe mechanism, not because we expect
183 * there to be dirty buffers on any of the other lists.
185 repeat:
186 spin_lock(&lru_list_lock);
187 bh = lru_list[BUF_DIRTY];
188 if (!bh)
189 goto repeat2;
191 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
192 next = bh->b_next_free;
194 if (!lru_list[BUF_DIRTY])
195 break;
196 if (dev && bh->b_dev != dev)
197 continue;
198 if (buffer_locked(bh)) {
199 /* Buffer is locked; skip it unless wait is
200 * requested AND pass > 0.
202 if (!wait || !pass) {
203 retry = 1;
204 continue;
206 atomic_inc(&bh->b_count);
207 spin_unlock(&lru_list_lock);
208 wait_on_buffer (bh);
209 atomic_dec(&bh->b_count);
210 goto repeat;
213 /* If an unlocked buffer is not uptodate, there has
214 * been an IO error. Skip it.
216 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
217 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
218 err = -EIO;
219 continue;
222 /* Don't write clean buffers. Don't write ANY buffers
223 * on the third pass.
225 if (!buffer_dirty(bh) || pass >= 2)
226 continue;
228 atomic_inc(&bh->b_count);
229 bh->b_flushtime = 0;
230 spin_unlock(&lru_list_lock);
231 ll_rw_block(WRITE, 1, &bh);
232 atomic_dec(&bh->b_count);
233 retry = 1;
234 goto repeat;
237 repeat2:
238 bh = lru_list[BUF_LOCKED];
239 if (!bh) {
240 spin_unlock(&lru_list_lock);
241 break;
243 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
244 next = bh->b_next_free;
246 if (!lru_list[BUF_LOCKED])
247 break;
248 if (dev && bh->b_dev != dev)
249 continue;
250 if (buffer_locked(bh)) {
251 /* Buffer is locked; skip it unless wait is
252 * requested AND pass > 0.
254 if (!wait || !pass) {
255 retry = 1;
256 continue;
258 atomic_inc(&bh->b_count);
259 spin_unlock(&lru_list_lock);
260 wait_on_buffer (bh);
261 spin_lock(&lru_list_lock);
262 atomic_dec(&bh->b_count);
263 goto repeat2;
266 spin_unlock(&lru_list_lock);
268 /* If we are waiting for the sync to succeed, and if any dirty
269 * blocks were written, then repeat; on the second pass, only
270 * wait for buffers being written (do not pass to write any
271 * more buffers on the second pass).
273 } while (wait && retry && ++pass<=2);
274 return err;
277 void sync_dev(kdev_t dev)
279 sync_buffers(dev, 0);
280 sync_supers(dev);
281 sync_inodes(dev);
282 sync_buffers(dev, 0);
283 DQUOT_SYNC(dev);
285 * FIXME(eric) we need to sync the physical devices here.
286 * This is because some (scsi) controllers have huge amounts of
287 * cache onboard (hundreds of Mb), and we need to instruct
288 * them to commit all of the dirty memory to disk, and we should
289 * not return until this has happened.
291 * This would need to get implemented by going through the assorted
292 * layers so that each block major number can be synced, and this
293 * would call down into the upper and mid-layer scsi.
297 int fsync_dev(kdev_t dev)
299 sync_buffers(dev, 0);
301 lock_kernel();
302 sync_supers(dev);
303 sync_inodes(dev);
304 DQUOT_SYNC(dev);
305 unlock_kernel();
307 return sync_buffers(dev, 1);
310 asmlinkage int sys_sync(void)
312 fsync_dev(0);
313 return 0;
317 * filp may be NULL if called via the msync of a vma.
320 int file_fsync(struct file *filp, struct dentry *dentry)
322 struct inode * inode = dentry->d_inode;
323 struct super_block * sb;
324 kdev_t dev;
326 /* sync the inode to buffers */
327 write_inode_now(inode);
329 /* sync the superblock to buffers */
330 sb = inode->i_sb;
331 wait_on_super(sb);
332 if (sb->s_op && sb->s_op->write_super)
333 sb->s_op->write_super(sb);
335 /* .. finally sync the buffers to disk */
336 dev = inode->i_dev;
337 return sync_buffers(dev, 1);
340 asmlinkage int sys_fsync(unsigned int fd)
342 struct file * file;
343 struct dentry * dentry;
344 struct inode * inode;
345 int err;
347 lock_kernel();
348 err = -EBADF;
349 file = fget(fd);
350 if (!file)
351 goto out;
353 dentry = file->f_dentry;
354 if (!dentry)
355 goto out_putf;
357 inode = dentry->d_inode;
358 if (!inode)
359 goto out_putf;
361 err = -EINVAL;
362 if (!file->f_op || !file->f_op->fsync)
363 goto out_putf;
365 /* We need to protect against concurrent writers.. */
366 down(&inode->i_sem);
367 err = file->f_op->fsync(file, dentry);
368 up(&inode->i_sem);
370 out_putf:
371 fput(file);
372 out:
373 unlock_kernel();
374 return err;
377 asmlinkage int sys_fdatasync(unsigned int fd)
379 struct file * file;
380 struct dentry * dentry;
381 struct inode * inode;
382 int err;
384 lock_kernel();
385 err = -EBADF;
386 file = fget(fd);
387 if (!file)
388 goto out;
390 dentry = file->f_dentry;
391 if (!dentry)
392 goto out_putf;
394 inode = dentry->d_inode;
395 if (!inode)
396 goto out_putf;
398 err = -EINVAL;
399 if (!file->f_op || !file->f_op->fsync)
400 goto out_putf;
402 /* this needs further work, at the moment it is identical to fsync() */
403 down(&inode->i_sem);
404 err = file->f_op->fsync(file, dentry);
405 up(&inode->i_sem);
407 out_putf:
408 fput(file);
409 out:
410 unlock_kernel();
411 return err;
414 void invalidate_buffers(kdev_t dev)
416 int nlist;
418 spin_lock(&lru_list_lock);
419 for(nlist = 0; nlist < NR_LIST; nlist++) {
420 struct buffer_head * bh;
421 int i;
422 retry:
423 bh = lru_list[nlist];
424 if (!bh)
425 continue;
426 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
427 if (bh->b_dev != dev)
428 continue;
429 if (buffer_locked(bh)) {
430 atomic_inc(&bh->b_count);
431 spin_unlock(&lru_list_lock);
432 wait_on_buffer(bh);
433 spin_lock(&lru_list_lock);
434 atomic_dec(&bh->b_count);
435 goto retry;
437 if (atomic_read(&bh->b_count))
438 continue;
439 bh->b_flushtime = 0;
440 clear_bit(BH_Protected, &bh->b_state);
441 clear_bit(BH_Uptodate, &bh->b_state);
442 clear_bit(BH_Dirty, &bh->b_state);
443 clear_bit(BH_Req, &bh->b_state);
446 spin_unlock(&lru_list_lock);
449 /* After several hours of tedious analysis, the following hash
450 * function won. Do not mess with it... -DaveM
452 #define _hashfn(dev,block) \
453 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
454 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
455 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
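A standalone sketch of how _hashfn() folds a (dev, block) pair into a bucket index. It assumes a 16384-entry table, so bh_hash_shift works out to 14, computed the same way buffer_init() does at the bottom of this file:

#include <stdio.h>

static unsigned int bh_hash_mask;
static unsigned int bh_hash_shift;

/* Same folding as the kernel macro above. */
#define _hashfn(dev,block) \
  ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
   (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))

int main(void)
{
	unsigned int nr_hash = 16384;		/* assumed table size (power of two) */
	bh_hash_mask = nr_hash - 1;
	for (unsigned int tmp = nr_hash; (tmp >>= 1) != 0; )
		bh_hash_shift++;		/* 14 for a 16384-entry table */

	unsigned int dev = 0x0803;		/* e.g. major 8, minor 3 */
	for (unsigned int block = 0; block < 4; block++)
		printf("dev %#x block %u -> bucket %u\n",
		       dev, block, _hashfn(dev, block) & bh_hash_mask);
	return 0;
}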
457 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
459 if ((bh->b_next = *head) != NULL)
460 bh->b_next->b_pprev = &bh->b_next;
461 *head = bh;
462 bh->b_pprev = head;
465 static __inline__ void __hash_unlink(struct buffer_head *bh)
467 if (bh->b_next)
468 bh->b_next->b_pprev = bh->b_pprev;
469 *(bh->b_pprev) = bh->b_next;
470 bh->b_pprev = NULL;
473 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
475 struct buffer_head **bhp = &lru_list[blist];
477 if(!*bhp) {
478 *bhp = bh;
479 bh->b_prev_free = bh;
481 bh->b_next_free = *bhp;
482 bh->b_prev_free = (*bhp)->b_prev_free;
483 (*bhp)->b_prev_free->b_next_free = bh;
484 (*bhp)->b_prev_free = bh;
485 nr_buffers_type[blist]++;
488 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
490 if (bh->b_prev_free || bh->b_next_free) {
491 bh->b_prev_free->b_next_free = bh->b_next_free;
492 bh->b_next_free->b_prev_free = bh->b_prev_free;
493 if (lru_list[blist] == bh)
494 lru_list[blist] = bh->b_next_free;
495 if (lru_list[blist] == bh)
496 lru_list[blist] = NULL;
497 bh->b_next_free = bh->b_prev_free = NULL;
498 nr_buffers_type[blist]--;
502 static void __remove_from_free_list(struct buffer_head * bh, int index)
504 if(bh->b_next_free == bh)
505 free_list[index].list = NULL;
506 else {
507 bh->b_prev_free->b_next_free = bh->b_next_free;
508 bh->b_next_free->b_prev_free = bh->b_prev_free;
509 if (free_list[index].list == bh)
510 free_list[index].list = bh->b_next_free;
512 bh->b_next_free = bh->b_prev_free = NULL;
515 /* The following two functions must operate atomically
516 * because they control the visibility of a buffer head
517 * to the rest of the kernel.
519 static __inline__ void __remove_from_queues(struct buffer_head *bh)
521 write_lock(&hash_table_lock);
522 if (bh->b_pprev)
523 __hash_unlink(bh);
524 __remove_from_lru_list(bh, bh->b_list);
525 write_unlock(&hash_table_lock);
528 static void insert_into_queues(struct buffer_head *bh)
530 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
532 spin_lock(&lru_list_lock);
533 write_lock(&hash_table_lock);
534 __hash_link(bh, head);
535 __insert_into_lru_list(bh, bh->b_list);
536 write_unlock(&hash_table_lock);
537 spin_unlock(&lru_list_lock);
540 /* This function must only run if there are no other
541 * references _anywhere_ to this buffer head.
543 static void put_last_free(struct buffer_head * bh)
545 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
546 struct buffer_head **bhp = &head->list;
548 spin_lock(&head->lock);
549 bh->b_dev = B_FREE;
550 if(!*bhp) {
551 *bhp = bh;
552 bh->b_prev_free = bh;
554 bh->b_next_free = *bhp;
555 bh->b_prev_free = (*bhp)->b_prev_free;
556 (*bhp)->b_prev_free->b_next_free = bh;
557 (*bhp)->b_prev_free = bh;
558 spin_unlock(&head->lock);
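Both the per-size free lists and the LRU lists are circular doubly linked rings threaded through b_next_free/b_prev_free, and put_last_free() / __insert_into_lru_list() splice new entries in just before the head, i.e. at the tail. A userspace sketch of that splice with illustrative names (not real buffer heads):

#include <stdio.h>

struct node {
	int blocknr;
	struct node *next_free, *prev_free;
};

/* Mirrors the kernel's tail insert: seed an empty ring so the common
 * splice path below also handles the single-element case. */
static void put_last(struct node **head, struct node *n)
{
	if (!*head) {
		*head = n;
		n->prev_free = n;
	}
	n->next_free = *head;
	n->prev_free = (*head)->prev_free;
	(*head)->prev_free->next_free = n;
	(*head)->prev_free = n;
}

int main(void)
{
	struct node nodes[3] = {{10}, {20}, {30}};
	struct node *head = NULL, *n;

	for (int i = 0; i < 3; i++)
		put_last(&head, &nodes[i]);

	n = head;				/* walk the ring once from the head */
	do {
		printf("%d ", n->blocknr);
		n = n->next_free;
	} while (n != head);
	printf("\n");				/* prints: 10 20 30 */
	return 0;
}

Keeping the ring circular is what lets __remove_from_lru_list() unlink an arbitrary element by relinking its two neighbours, with no NULL checks on them.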
562 * Why like this, I hear you say... The reason is race-conditions.
563 * As we don't lock buffers (unless we are reading them, that is),
564 * something might happen to it while we sleep (ie a read-error
565 * will force it bad). This shouldn't really happen currently, but
566 * the code is ready.
568 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
570 struct buffer_head **head = &hash(dev, block);
571 struct buffer_head *bh;
573 read_lock(&hash_table_lock);
574 for(bh = *head; bh; bh = bh->b_next)
575 if (bh->b_blocknr == block &&
576 bh->b_size == size &&
577 bh->b_dev == dev)
578 break;
579 if (bh)
580 atomic_inc(&bh->b_count);
581 read_unlock(&hash_table_lock);
583 return bh;
586 unsigned int get_hardblocksize(kdev_t dev)
589 * Get the hard sector size for the given device. If we don't know
590 * what it is, return 0.
592 if (hardsect_size[MAJOR(dev)] != NULL) {
593 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
594 if (blksize != 0)
595 return blksize;
599 * We don't know what the hardware sector size for this device is.
600 * Return 0 indicating that we don't know.
602 return 0;
605 void set_blocksize(kdev_t dev, int size)
607 extern int *blksize_size[];
608 int i, nlist;
609 struct buffer_head * bh, *bhnext;
611 if (!blksize_size[MAJOR(dev)])
612 return;
614 /* Size must be a power of two, and between 512 and PAGE_SIZE */
615 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
616 panic("Invalid blocksize passed to set_blocksize");
618 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
619 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
620 return;
622 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
623 return;
624 sync_buffers(dev, 2);
625 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
627 /* We need to be quite careful how we do this - we are moving entries
628 * around on the free list, and we can get in a loop if we are not careful.
630 for(nlist = 0; nlist < NR_LIST; nlist++) {
631 repeat:
632 spin_lock(&lru_list_lock);
633 bh = lru_list[nlist];
634 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
635 if(!bh)
636 break;
638 bhnext = bh->b_next_free;
639 if (bh->b_dev != dev)
640 continue;
641 if (bh->b_size == size)
642 continue;
643 if (buffer_locked(bh)) {
644 atomic_inc(&bh->b_count);
645 spin_unlock(&lru_list_lock);
646 wait_on_buffer(bh);
647 atomic_dec(&bh->b_count);
648 goto repeat;
650 if (bh->b_dev == dev && bh->b_size != size) {
651 clear_bit(BH_Dirty, &bh->b_state);
652 clear_bit(BH_Uptodate, &bh->b_state);
653 clear_bit(BH_Req, &bh->b_state);
654 bh->b_flushtime = 0;
656 if (atomic_read(&bh->b_count) == 0) {
657 __remove_from_queues(bh);
658 put_last_free(bh);
661 spin_unlock(&lru_list_lock);
666 * We used to try various strange things. Let's not.
668 static void refill_freelist(int size)
670 if (!grow_buffers(size)) {
671 wakeup_bdflush(1);
672 current->policy |= SCHED_YIELD;
673 schedule();
677 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
679 bh->b_list = BUF_CLEAN;
680 bh->b_flushtime = 0;
681 bh->b_end_io = handler;
682 bh->b_dev_id = dev_id;
685 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
687 mark_buffer_uptodate(bh, uptodate);
688 unlock_buffer(bh);
691 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
693 mark_buffer_uptodate(bh, uptodate);
694 unlock_buffer(bh);
695 BUG();
698 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
700 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
701 unsigned long flags;
702 struct buffer_head *tmp;
703 struct page *page;
704 int free;
706 mark_buffer_uptodate(bh, uptodate);
708 /* This is a temporary buffer used for page I/O. */
709 page = mem_map + MAP_NR(bh->b_data);
711 if (!uptodate)
712 SetPageError(page);
715 * Be _very_ careful from here on. Bad things can happen if
716 * two buffer heads end IO at almost the same time and both
717 * decide that the page is now completely done.
719 * Async buffer_heads are here only as labels for IO, and get
720 * thrown away once the IO for this page is complete. IO is
721 * deemed complete once all buffers have been visited
722 * (b_count==0) and are now unlocked. We must make sure that
723 * only the _last_ buffer that decrements its count is the one
724 * that free's the page..
726 spin_lock_irqsave(&page_uptodate_lock, flags);
727 unlock_buffer(bh);
728 atomic_dec(&bh->b_count);
729 tmp = bh->b_this_page;
730 while (tmp != bh) {
731 if (atomic_read(&tmp->b_count) &&
732 (tmp->b_end_io == end_buffer_io_async))
733 goto still_busy;
734 tmp = tmp->b_this_page;
737 /* OK, the async IO on this page is complete. */
738 spin_unlock_irqrestore(&page_uptodate_lock, flags);
741 * if none of the buffers had errors then we can set the
742 * page uptodate:
744 if (!PageError(page))
745 SetPageUptodate(page);
748 * Run the hooks that have to be done when a page I/O has completed.
750 * Note - we need to test the flags before we unlock the page, but
751 * we must not actually free the page until after the unlock!
753 if (test_and_clear_bit(PG_decr_after, &page->flags))
754 atomic_dec(&nr_async_pages);
756 if (test_and_clear_bit(PG_free_swap_after, &page->flags))
757 swap_free(page->offset);
759 free = test_and_clear_bit(PG_free_after, &page->flags);
761 if (page->owner != (void *)-1)
762 PAGE_BUG(page);
763 page->owner = current;
764 UnlockPage(page);
766 if (free)
767 __free_page(page);
769 return;
771 still_busy:
772 spin_unlock_irqrestore(&page_uptodate_lock, flags);
773 return;
778 * Ok, this is getblk, and it isn't very clear, again to hinder
779 * race-conditions. Most of the code is seldom used, (ie repeating),
780 * so it should be much more efficient than it looks.
782 * The algorithm is changed: hopefully better, and an elusive bug removed.
784 * 14.02.92: changed it to sync dirty buffers a bit: better performance
785 * when the filesystem starts to get full of dirty blocks (I hope).
787 struct buffer_head * getblk(kdev_t dev, int block, int size)
789 struct buffer_head * bh;
790 int isize;
792 repeat:
793 bh = get_hash_table(dev, block, size);
794 if (bh) {
795 if (!buffer_dirty(bh)) {
796 bh->b_flushtime = 0;
798 goto out;
801 isize = BUFSIZE_INDEX(size);
802 spin_lock(&free_list[isize].lock);
803 bh = free_list[isize].list;
804 if (bh) {
805 __remove_from_free_list(bh, isize);
806 atomic_set(&bh->b_count, 1);
808 spin_unlock(&free_list[isize].lock);
809 if (!bh)
810 goto refill;
812 /* OK, FINALLY we know that this buffer is the only one of its kind,
813 * we hold a reference (b_count>0), it is unlocked, and it is clean.
815 init_buffer(bh, end_buffer_io_sync, NULL);
816 bh->b_dev = dev;
817 bh->b_blocknr = block;
818 bh->b_state = 1 << BH_Mapped;
820 /* Insert the buffer into the regular lists */
821 insert_into_queues(bh);
822 goto out;
825 * If we block while refilling the free list, somebody may
826 * create the buffer first ... search the hashes again.
828 refill:
829 refill_freelist(size);
830 goto repeat;
831 out:
832 return bh;
836 * if a new dirty buffer is created we need to balance bdflush.
838 * in the future we might want to make bdflush aware of different
839 * pressures on different devices - thus the (currently unused)
840 * 'dev' parameter.
842 int too_many_dirty_buffers;
844 void balance_dirty(kdev_t dev)
846 int dirty = nr_buffers_type[BUF_DIRTY];
847 int ndirty = bdf_prm.b_un.ndirty;
849 if (dirty > ndirty) {
850 if (dirty > 2*ndirty) {
851 too_many_dirty_buffers = 1;
852 wakeup_bdflush(1);
853 return;
855 wakeup_bdflush(0);
857 too_many_dirty_buffers = 0;
858 return;
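With the default bdf_prm values above (ndirty = 500), balance_dirty() therefore does nothing up to 500 dirty buffers, wakes bdflush asynchronously between 501 and 1000, and beyond 1000 declares too_many_dirty_buffers and waits for bdflush via wakeup_bdflush(1). A small standalone sketch of those thresholds:

#include <stdio.h>

int main(void)
{
	int ndirty = 500;			/* bdf_prm.b_un.ndirty default */
	int samples[] = { 100, 600, 1200 };

	for (int i = 0; i < 3; i++) {
		int dirty = samples[i];
		if (dirty > 2 * ndirty)
			printf("%4d dirty: panic mode, wake bdflush and wait\n", dirty);
		else if (dirty > ndirty)
			printf("%4d dirty: wake bdflush, don't wait\n", dirty);
		else
			printf("%4d dirty: nothing to do\n", dirty);
	}
	return 0;
}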
861 static inline void __mark_dirty(struct buffer_head *bh, int flag)
863 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
864 clear_bit(BH_New, &bh->b_state);
865 refile_buffer(bh);
868 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
870 __mark_dirty(bh, flag);
874 * A buffer may need to be moved from one buffer list to another
875 * (e.g. in case it is not shared any more). Handle this.
877 static __inline__ void __refile_buffer(struct buffer_head *bh)
879 int dispose = BUF_CLEAN;
880 if (buffer_locked(bh))
881 dispose = BUF_LOCKED;
882 if (buffer_dirty(bh))
883 dispose = BUF_DIRTY;
884 if (dispose != bh->b_list) {
885 __remove_from_lru_list(bh, bh->b_list);
886 bh->b_list = dispose;
887 __insert_into_lru_list(bh, dispose);
891 void refile_buffer(struct buffer_head *bh)
893 spin_lock(&lru_list_lock);
894 __refile_buffer(bh);
895 spin_unlock(&lru_list_lock);
899 * Release a buffer head
901 void __brelse(struct buffer_head * buf)
903 touch_buffer(buf);
905 if (atomic_read(&buf->b_count)) {
906 atomic_dec(&buf->b_count);
907 return;
909 printk("VFS: brelse: Trying to free free buffer\n");
913 * bforget() is like brelse(), except it puts the buffer on the
914 * free list if it can.. We can NOT free the buffer if:
915 * - there are other users of it
916 * - it is locked and thus can have active IO
918 void __bforget(struct buffer_head * buf)
920 spin_lock(&lru_list_lock);
921 write_lock(&hash_table_lock);
922 if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
923 touch_buffer(buf);
924 atomic_dec(&buf->b_count);
925 } else {
926 atomic_set(&buf->b_count, 0);
927 buf->b_state = 0;
928 if (buf->b_pprev)
929 __hash_unlink(buf);
930 __remove_from_lru_list(buf, buf->b_list);
931 put_last_free(buf);
933 write_unlock(&hash_table_lock);
934 spin_unlock(&lru_list_lock);
938 * bread() reads a specified block and returns the buffer that contains
939 * it. It returns NULL if the block was unreadable.
941 struct buffer_head * bread(kdev_t dev, int block, int size)
943 struct buffer_head * bh;
945 bh = getblk(dev, block, size);
946 if (buffer_uptodate(bh))
947 return bh;
948 ll_rw_block(READ, 1, &bh);
949 wait_on_buffer(bh);
950 if (buffer_uptodate(bh))
951 return bh;
952 brelse(bh);
953 return NULL;
957 * Ok, breada can be used as bread, but additionally to mark other
958 * blocks for reading as well. End the argument list with a negative
959 * number.
962 #define NBUF 16
964 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
965 unsigned int pos, unsigned int filesize)
967 struct buffer_head * bhlist[NBUF];
968 unsigned int blocks;
969 struct buffer_head * bh;
970 int index;
971 int i, j;
973 if (pos >= filesize)
974 return NULL;
976 if (block < 0)
977 return NULL;
979 bh = getblk(dev, block, bufsize);
980 index = BUFSIZE_INDEX(bh->b_size);
982 if (buffer_uptodate(bh))
983 return(bh);
984 else ll_rw_block(READ, 1, &bh);
986 blocks = (filesize - pos) >> (9+index);
988 if (blocks < (read_ahead[MAJOR(dev)] >> index))
989 blocks = read_ahead[MAJOR(dev)] >> index;
990 if (blocks > NBUF)
991 blocks = NBUF;
993 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
995 bhlist[0] = bh;
996 j = 1;
997 for(i=1; i<blocks; i++) {
998 bh = getblk(dev,block+i,bufsize);
999 if (buffer_uptodate(bh)) {
1000 brelse(bh);
1001 break;
1003 else bhlist[j++] = bh;
1006 /* Request the read for these buffers, and then release them. */
1007 if (j>1)
1008 ll_rw_block(READA, (j-1), bhlist+1);
1009 for(i=1; i<j; i++)
1010 brelse(bhlist[i]);
1012 /* Wait for this buffer, and then continue on. */
1013 bh = bhlist[0];
1014 wait_on_buffer(bh);
1015 if (buffer_uptodate(bh))
1016 return bh;
1017 brelse(bh);
1018 return NULL;
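The read-ahead window in breada() above is the larger of the blocks remaining in the file and the per-major read_ahead setting, capped at NBUF. A userspace sketch of that sizing, assuming 1K blocks (index 1) and a read_ahead value expressed in 512-byte sectors:

#include <stdio.h>

#define NBUF 16

int main(void)
{
	int index = 1;				/* BUFSIZE_INDEX(1024) */
	unsigned int read_ahead = 8;		/* assumed read_ahead[MAJOR(dev)] */
	unsigned int pos = 0, filesize = 40 * 1024;

	unsigned int blocks = (filesize - pos) >> (9 + index);
	if (blocks < (read_ahead >> index))
		blocks = read_ahead >> index;
	if (blocks > NBUF)
		blocks = NBUF;

	/* breada() requests bhlist[0] plus up to blocks-1 read-ahead blocks */
	printf("%u bytes left: at most %u blocks queued (1 + %u read-ahead)\n",
	       filesize - pos, blocks, blocks - 1);
	return 0;
}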
1022 * Note: the caller should wake up the buffer_wait list if needed.
1024 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1026 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1027 kmem_cache_free(bh_cachep, bh);
1028 } else {
1029 bh->b_blocknr = -1;
1030 init_waitqueue_head(&bh->b_wait);
1031 nr_unused_buffer_heads++;
1032 bh->b_next_free = unused_list;
1033 bh->b_this_page = NULL;
1034 unused_list = bh;
1038 static void put_unused_buffer_head(struct buffer_head *bh)
1040 spin_lock(&unused_list_lock);
1041 __put_unused_buffer_head(bh);
1042 spin_unlock(&unused_list_lock);
1046 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1047 * no-buffer-head deadlock. Return NULL on failure; waiting for
1048 * buffer heads is now handled in create_buffers().
1050 static struct buffer_head * get_unused_buffer_head(int async)
1052 struct buffer_head * bh;
1054 spin_lock(&unused_list_lock);
1055 if (nr_unused_buffer_heads > NR_RESERVED) {
1056 bh = unused_list;
1057 unused_list = bh->b_next_free;
1058 nr_unused_buffer_heads--;
1059 spin_unlock(&unused_list_lock);
1060 return bh;
1062 spin_unlock(&unused_list_lock);
1064 /* This is critical. We can't swap out pages to get
1065 * more buffer heads, because the swap-out may need
1066 * more buffer-heads itself. Thus SLAB_BUFFER.
1068 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1069 memset(bh, 0, sizeof(*bh));
1070 init_waitqueue_head(&bh->b_wait);
1071 return bh;
1075 * If we need an async buffer, use the reserved buffer heads.
1077 if (async) {
1078 spin_lock(&unused_list_lock);
1079 if (unused_list) {
1080 bh = unused_list;
1081 unused_list = bh->b_next_free;
1082 nr_unused_buffer_heads--;
1083 spin_unlock(&unused_list_lock);
1084 return bh;
1086 spin_unlock(&unused_list_lock);
1088 #if 0
1090 * (Pending further analysis ...)
1091 * Ordinary (non-async) requests can use a different memory priority
1092 * to free up pages. Any swapping thus generated will use async
1093 * buffer heads.
1095 if(!async &&
1096 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1097 memset(bh, 0, sizeof(*bh));
1098 init_waitqueue_head(&bh->b_wait);
1099 return bh;
1101 #endif
1103 return NULL;
1107 * Create the appropriate buffers when given a page for data area and
1108 * the size of each buffer.. Use the bh->b_this_page linked list to
1109 * follow the buffers created. Return NULL if unable to create more
1110 * buffers.
1111 * The async flag is used to differentiate async IO (paging, swapping)
1112 * from ordinary buffer allocations, and only async requests are allowed
1113 * to sleep waiting for buffer heads.
1115 static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async)
1117 DECLARE_WAITQUEUE(wait, current);
1118 struct buffer_head *bh, *head;
1119 long offset;
1121 try_again:
1122 head = NULL;
1123 offset = PAGE_SIZE;
1124 while ((offset -= size) >= 0) {
1125 bh = get_unused_buffer_head(async);
1126 if (!bh)
1127 goto no_grow;
1129 bh->b_dev = B_FREE; /* Flag as unused */
1130 bh->b_this_page = head;
1131 head = bh;
1133 bh->b_state = 0;
1134 bh->b_next_free = NULL;
1135 bh->b_pprev = NULL;
1136 atomic_set(&bh->b_count, 0);
1137 bh->b_size = size;
1139 bh->b_data = (char *) (page+offset);
1140 bh->b_list = BUF_CLEAN;
1141 bh->b_flushtime = 0;
1142 bh->b_end_io = end_buffer_io_bad;
1144 return head;
1146 * In case anything failed, we just free everything we got.
1148 no_grow:
1149 if (head) {
1150 do {
1151 bh = head;
1152 head = head->b_this_page;
1153 put_unused_buffer_head(bh);
1154 } while (head);
1156 /* Wake up any waiters ... */
1157 wake_up(&buffer_wait);
1161 * Return failure for non-async IO requests. Async IO requests
1162 * are not allowed to fail, so we have to wait until buffer heads
1163 * become available. But we don't want tasks sleeping with
1164 * partially complete buffers, so all were released above.
1166 if (!async)
1167 return NULL;
1169 /* We're _really_ low on memory. Now we just
1170 * wait for old buffer heads to become free due to
1171 * finishing IO. Since this is an async request and
1172 * the reserve list is empty, we're sure there are
1173 * async buffer heads in use.
1175 run_task_queue(&tq_disk);
1178 * Set our state for sleeping, then check again for buffer heads.
1179 * This ensures we won't miss a wake_up from an interrupt.
1181 add_wait_queue(&buffer_wait, &wait);
1182 current->state = TASK_UNINTERRUPTIBLE;
1183 if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
1184 current->policy |= SCHED_YIELD;
1185 schedule();
1187 remove_wait_queue(&buffer_wait, &wait);
1188 current->state = TASK_RUNNING;
1189 goto try_again;
1192 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1194 struct buffer_head *head, *bh, *tail;
1195 int block;
1197 if (!PageLocked(page))
1198 BUG();
1199 if (page->owner != current)
1200 PAGE_BUG(page);
1202 * Allocate async buffer heads pointing to this page, just for I/O.
1203 * They show up in the buffer hash table and are registered in
1204 * page->buffers.
1206 head = create_buffers(page_address(page), size, 1);
1207 if (page->buffers)
1208 BUG();
1209 if (!head)
1210 BUG();
1211 tail = head;
1212 for (bh = head; bh; bh = bh->b_this_page) {
1213 block = *(b++);
1215 tail = bh;
1216 init_buffer(bh, end_buffer_io_async, NULL);
1217 bh->b_dev = dev;
1218 bh->b_blocknr = block;
1221 * When we use bmap, we define block zero to represent
1222 * a hole. ll_rw_page, however, may legitimately
1223 * access block zero, and we need to distinguish the
1224 * two cases.
1226 if (bmap && !block) {
1227 memset(bh->b_data, 0, size);
1228 set_bit(BH_Uptodate, &bh->b_state);
1229 continue;
1231 set_bit(BH_Mapped, &bh->b_state);
1233 tail->b_this_page = head;
1234 get_page(page);
1235 page->buffers = head;
1236 return 0;
1240 * We don't have to release all buffers here, but
1241 * we have to be sure that no dirty buffer is left
1242 * and no IO is going on (no buffer is locked), because
1243 * we have truncated the file and are going to free the
1244 * blocks on-disk..
1246 int block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
1248 struct buffer_head *head, *bh, *next;
1249 unsigned int curr_off = 0;
1251 if (!PageLocked(page))
1252 BUG();
1253 if (!page->buffers)
1254 return 0;
1256 head = page->buffers;
1257 bh = head;
1258 do {
1259 unsigned int next_off = curr_off + bh->b_size;
1260 next = bh->b_this_page;
1263 * is this block fully flushed?
1265 if (offset <= curr_off) {
1266 if (buffer_mapped(bh)) {
1267 atomic_inc(&bh->b_count);
1268 wait_on_buffer(bh);
1269 if (bh->b_dev == B_FREE)
1270 BUG();
1271 mark_buffer_clean(bh);
1272 clear_bit(BH_Uptodate, &bh->b_state);
1273 clear_bit(BH_Mapped, &bh->b_state);
1274 clear_bit(BH_Req, &bh->b_state);
1275 bh->b_blocknr = 0;
1276 atomic_dec(&bh->b_count);
1279 curr_off = next_off;
1280 bh = next;
1281 } while (bh != head);
1284 * subtle. We release buffer-heads only if this is
1285 * the 'final' flushpage. We have invalidated the bmap
1286 * cached value unconditionally, so real IO is not
1287 * possible anymore.
1289 * If the free doesn't work out, the buffers can be
1290 * left around - they just turn into anonymous buffers
1291 * instead.
1293 if (!offset) {
1294 if (!try_to_free_buffers(page))
1295 atomic_add(PAGE_CACHE_SIZE, &buffermem);
1298 return 0;
1301 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1303 struct buffer_head *bh, *head, *tail;
1305 head = create_buffers(page_address(page), blocksize, 1);
1306 if (page->buffers)
1307 BUG();
1309 bh = head;
1310 do {
1311 bh->b_dev = inode->i_dev;
1312 bh->b_blocknr = 0;
1313 bh->b_end_io = end_buffer_io_bad;
1314 tail = bh;
1315 bh = bh->b_this_page;
1316 } while (bh);
1317 tail->b_this_page = head;
1318 page->buffers = head;
1319 get_page(page);
1323 * block_write_full_page() is SMP-safe - currently it's still
1324 * being called with the kernel lock held, but the code is ready.
1326 int block_write_full_page(struct file *file, struct page *page)
1328 struct dentry *dentry = file->f_dentry;
1329 struct inode *inode = dentry->d_inode;
1330 int err, i;
1331 unsigned long block, offset;
1332 struct buffer_head *bh, *head;
1334 if (!PageLocked(page))
1335 BUG();
1337 if (!page->buffers)
1338 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1339 head = page->buffers;
1341 offset = page->offset;
1342 block = offset >> inode->i_sb->s_blocksize_bits;
1344 // FIXME: currently we assume page alignment.
1345 if (offset & (PAGE_SIZE-1))
1346 BUG();
1348 bh = head;
1349 i = 0;
1350 do {
1351 if (!bh)
1352 BUG();
1355 * If the buffer isn't up-to-date, we can't be sure
1356 * that the buffer has been initialized with the proper
1357 * block number information etc..
1359 * Leave it to the low-level FS to make all those
1360 * decisions (block #0 may actually be a valid block)
1362 bh->b_end_io = end_buffer_io_sync;
1363 if (!buffer_mapped(bh)) {
1364 err = inode->i_op->get_block(inode, block, bh, 1);
1365 if (err)
1366 goto out;
1368 set_bit(BH_Uptodate, &bh->b_state);
1369 mark_buffer_dirty(bh,0);
1371 bh = bh->b_this_page;
1372 block++;
1373 } while (bh != head);
1375 SetPageUptodate(page);
1376 return 0;
1377 out:
1378 ClearPageUptodate(page);
1379 return err;
1382 int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
1384 struct dentry *dentry = file->f_dentry;
1385 struct inode *inode = dentry->d_inode;
1386 unsigned long block;
1387 int err, partial;
1388 unsigned long blocksize, start_block, end_block;
1389 unsigned long start_offset, start_bytes, end_bytes;
1390 unsigned long bbits, blocks, i, len;
1391 struct buffer_head *bh, *head;
1392 char * target_buf;
1394 target_buf = (char *)page_address(page) + offset;
1396 if (!PageLocked(page))
1397 BUG();
1399 blocksize = inode->i_sb->s_blocksize;
1400 if (!page->buffers)
1401 create_empty_buffers(page, inode, blocksize);
1402 head = page->buffers;
1404 bbits = inode->i_sb->s_blocksize_bits;
1405 block = page->offset >> bbits;
1406 blocks = PAGE_SIZE >> bbits;
1407 start_block = offset >> bbits;
1408 end_block = (offset + bytes - 1) >> bbits;
1409 start_offset = offset & (blocksize - 1);
1410 start_bytes = blocksize - start_offset;
1411 if (start_bytes > bytes)
1412 start_bytes = bytes;
1413 end_bytes = (offset+bytes) & (blocksize - 1);
1414 if (end_bytes > bytes)
1415 end_bytes = bytes;
1417 if (offset < 0 || offset >= PAGE_SIZE)
1418 BUG();
1419 if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
1420 BUG();
1421 if (start_block < 0 || start_block >= blocks)
1422 BUG();
1423 if (end_block < 0 || end_block >= blocks)
1424 BUG();
1425 // FIXME: currently we assume page alignment.
1426 if (page->offset & (PAGE_SIZE-1))
1427 BUG();
1429 i = 0;
1430 bh = head;
1431 partial = 0;
1432 do {
1433 if (!bh)
1434 BUG();
1436 if ((i < start_block) || (i > end_block)) {
1437 if (!buffer_uptodate(bh))
1438 partial = 1;
1439 goto skip;
1443 * If the buffer is not up-to-date, we need to ask the low-level
1444 * FS to do something for us (we used to have assumptions about
1445 * the meaning of b_blocknr etc, that's bad).
1447 * If "update" is set, that means that the low-level FS should
1448 * try to make sure that the block is up-to-date because we're
1449 * not going to fill it completely.
1451 bh->b_end_io = end_buffer_io_sync;
1452 if (!buffer_mapped(bh)) {
1453 err = inode->i_op->get_block(inode, block, bh, 1);
1454 if (err)
1455 goto out;
1458 if (!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) {
1459 if (buffer_new(bh)) {
1460 memset(bh->b_data, 0, bh->b_size);
1461 } else {
1462 ll_rw_block(READ, 1, &bh);
1463 wait_on_buffer(bh);
1464 err = -EIO;
1465 if (!buffer_uptodate(bh))
1466 goto out;
1470 len = blocksize;
1471 if (start_offset) {
1472 len = start_bytes;
1473 start_offset = 0;
1474 } else if (end_bytes && (i == end_block)) {
1475 len = end_bytes;
1476 end_bytes = 0;
1478 err = copy_from_user(target_buf, buf, len);
1479 target_buf += len;
1480 buf += len;
1483 * we dirty buffers only after copying the data into
1484 * the page - this way we can dirty the buffer even if
1485 * the bh is still doing IO.
1487 * NOTE! This also does a direct dirty balance check,
1488 * rather than relying on bdflush just waking up every
1489 * once in a while. This is to catch (and slow down)
1490 * the processes that write tons of buffers.
1492 * Note how we do NOT want to do this in the full block
1493 * case: full pages are flushed not by the people who
1494 * dirtied them, but by people who need memory. And we
1495 * should not penalize them for somebody else writing
1496 * lots of dirty pages.
1498 set_bit(BH_Uptodate, &bh->b_state);
1499 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1500 __mark_dirty(bh, 0);
1501 if (too_many_dirty_buffers)
1502 balance_dirty(bh->b_dev);
1505 if (err) {
1506 err = -EFAULT;
1507 goto out;
1510 skip:
1511 i++;
1512 block++;
1513 bh = bh->b_this_page;
1514 } while (bh != head);
1517 * If this is a partial write that happened to make all buffers
1518 * uptodate then we can optimize away a bogus readpage() for
1519 * the next read(). Here we 'discover' whether the page went
1520 * uptodate as a result of this (potentially partial) write.
1521 */
1522 if (!partial)
1523 SetPageUptodate(page);
1524 return bytes;
1525 out:
1526 ClearPageUptodate(page);
1527 return err;
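The start/end arithmetic at the top of block_write_partial_page() above decides which blocks of the page the write touches and how many bytes land in the first and last of them. A standalone sketch of that window computation, assuming a 1K block size inside a 4K page:

#include <stdio.h>

int main(void)
{
	unsigned long blocksize = 1024, bbits = 10;	/* assumed fs block size */
	unsigned long offset = 700, bytes = 1500;	/* write window inside the page */

	unsigned long start_block = offset >> bbits;
	unsigned long end_block = (offset + bytes - 1) >> bbits;
	unsigned long start_offset = offset & (blocksize - 1);
	unsigned long start_bytes = blocksize - start_offset;
	unsigned long end_bytes = (offset + bytes) & (blocksize - 1);

	if (start_bytes > bytes)
		start_bytes = bytes;
	if (end_bytes > bytes)
		end_bytes = bytes;

	printf("blocks %lu..%lu, %lu bytes in the first block, %lu in the last\n",
	       start_block, end_block, start_bytes, end_bytes);
	return 0;
}

For offset 700 and 1500 bytes this reports blocks 0..2 with 324 bytes in the first block and 152 in the last (324 + 1024 + 152 = 1500); only those two partially covered blocks may need to be read or zero-filled before the copy_from_user().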
1531 * Start I/O on a page.
1532 * This function expects the page to be locked and may return
1533 * before I/O is complete. You then have to check page->locked,
1534 * page->uptodate, and maybe wait on page->wait.
1536 * brw_page() is SMP-safe, although it's being called with the
1537 * kernel lock held - but the code is ready.
1539 * FIXME: we need a swapper_inode->get_block function to remove
1540 * some of the bmap kludges and interface ugliness here.
1542 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1544 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1545 int nr, fresh /* temporary debugging flag */, block;
1547 if (!PageLocked(page))
1548 panic("brw_page: page not locked for I/O");
1549 // clear_bit(PG_error, &page->flags);
1551 * We pretty much rely on the page lock for this, because
1552 * create_page_buffers() might sleep.
1554 fresh = 0;
1555 if (!page->buffers) {
1556 create_page_buffers(rw, page, dev, b, size, bmap);
1557 fresh = 1;
1559 if (!page->buffers)
1560 BUG();
1561 page->owner = (void *)-1;
1563 head = page->buffers;
1564 bh = head;
1565 nr = 0;
1566 do {
1567 block = *(b++);
1569 if (fresh && (atomic_read(&bh->b_count) != 0))
1570 BUG();
1571 if (rw == READ) {
1572 if (!fresh)
1573 BUG();
1574 if (bmap && !block) {
1575 if (block)
1576 BUG();
1577 } else {
1578 if (bmap && !block)
1579 BUG();
1580 if (!buffer_uptodate(bh)) {
1581 arr[nr++] = bh;
1582 atomic_inc(&bh->b_count);
1585 } else { /* WRITE */
1586 if (!bh->b_blocknr) {
1587 if (!block)
1588 BUG();
1589 bh->b_blocknr = block;
1590 } else {
1591 if (!block)
1592 BUG();
1594 set_bit(BH_Uptodate, &bh->b_state);
1595 set_bit(BH_Dirty, &bh->b_state);
1596 arr[nr++] = bh;
1597 atomic_inc(&bh->b_count);
1599 bh = bh->b_this_page;
1600 } while (bh != head);
1601 if (rw == READ)
1602 ++current->maj_flt;
1603 if ((rw == READ) && nr) {
1604 if (Page_Uptodate(page))
1605 BUG();
1606 ll_rw_block(rw, nr, arr);
1607 } else {
1608 if (!nr && rw == READ) {
1609 SetPageUptodate(page);
1610 page->owner = current;
1611 UnlockPage(page);
1613 if (nr && (rw == WRITE))
1614 ll_rw_block(rw, nr, arr);
1616 return 0;
1620 * Generic "read page" function for block devices that have the normal
1621 * bmap functionality. This is most of the block device filesystems.
1622 * Reads the page asynchronously --- the unlock_buffer() and
1623 * mark_buffer_uptodate() functions propagate buffer state into the
1624 * page struct once IO has completed.
1626 int block_read_full_page(struct file * file, struct page * page)
1628 struct dentry *dentry = file->f_dentry;
1629 struct inode *inode = dentry->d_inode;
1630 unsigned long iblock;
1631 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1632 unsigned int blocksize, blocks;
1633 int nr;
1635 if (!PageLocked(page))
1636 PAGE_BUG(page);
1637 blocksize = inode->i_sb->s_blocksize;
1638 if (!page->buffers)
1639 create_empty_buffers(page, inode, blocksize);
1640 head = page->buffers;
1642 blocks = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1643 iblock = page->offset >> inode->i_sb->s_blocksize_bits;
1644 page->owner = (void *)-1;
1645 head = page->buffers;
1646 bh = head;
1647 nr = 0;
1649 do {
1650 if (buffer_uptodate(bh))
1651 continue;
1653 if (!buffer_mapped(bh)) {
1654 inode->i_op->get_block(inode, iblock, bh, 0);
1655 if (!buffer_mapped(bh)) {
1656 memset(bh->b_data, 0, blocksize);
1657 set_bit(BH_Uptodate, &bh->b_state);
1658 continue;
1662 init_buffer(bh, end_buffer_io_async, NULL);
1663 atomic_inc(&bh->b_count);
1664 arr[nr] = bh;
1665 nr++;
1666 } while (iblock++, (bh = bh->b_this_page) != head);
1668 ++current->maj_flt;
1669 if (nr) {
1670 if (Page_Uptodate(page))
1671 BUG();
1672 ll_rw_block(READ, nr, arr);
1673 } else {
1675 * all buffers are uptodate - we can set the page
1676 * uptodate as well.
1678 SetPageUptodate(page);
1679 page->owner = current;
1680 UnlockPage(page);
1682 return 0;
1686 * Try to increase the number of buffers available: the size argument
1687 * is used to determine what kind of buffers we want.
1689 static int grow_buffers(int size)
1691 unsigned long page;
1692 struct buffer_head *bh, *tmp;
1693 struct buffer_head * insert_point;
1694 int isize;
1696 if ((size & 511) || (size > PAGE_SIZE)) {
1697 printk("VFS: grow_buffers: size = %d\n",size);
1698 return 0;
1701 if (!(page = __get_free_page(GFP_BUFFER)))
1702 return 0;
1703 bh = create_buffers(page, size, 0);
1704 if (!bh) {
1705 free_page(page);
1706 return 0;
1709 isize = BUFSIZE_INDEX(size);
1711 spin_lock(&free_list[isize].lock);
1712 insert_point = free_list[isize].list;
1713 tmp = bh;
1714 while (1) {
1715 if (insert_point) {
1716 tmp->b_next_free = insert_point->b_next_free;
1717 tmp->b_prev_free = insert_point;
1718 insert_point->b_next_free->b_prev_free = tmp;
1719 insert_point->b_next_free = tmp;
1720 } else {
1721 tmp->b_prev_free = tmp;
1722 tmp->b_next_free = tmp;
1724 insert_point = tmp;
1725 if (tmp->b_this_page)
1726 tmp = tmp->b_this_page;
1727 else
1728 break;
1730 tmp->b_this_page = bh;
1731 free_list[isize].list = bh;
1732 spin_unlock(&free_list[isize].lock);
1734 mem_map[MAP_NR(page)].buffers = bh;
1735 atomic_add(PAGE_SIZE, &buffermem);
1736 return 1;
1740 * Can the buffer be thrown out?
1742 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1743 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
1746 * try_to_free_buffers() checks if all the buffers on this particular page
1747 * are unused, and frees the page if so.
1749 * Wake up bdflush() if this fails - if we're running low on memory due
1750 * to dirty buffers, we need to flush them out as quickly as possible.
1752 * NOTE: There are quite a number of ways that threads of control can
1753 * obtain a reference to a buffer head within a page. So we must
1754 * lock out all of these paths to cleanly toss the page.
1756 int try_to_free_buffers(struct page * page)
1758 struct buffer_head * tmp, * bh = page->buffers;
1759 int index = BUFSIZE_INDEX(bh->b_size);
1760 int ret;
1762 spin_lock(&lru_list_lock);
1763 write_lock(&hash_table_lock);
1764 spin_lock(&free_list[index].lock);
1765 tmp = bh;
1766 do {
1767 struct buffer_head * p = tmp;
1769 tmp = tmp->b_this_page;
1770 if (buffer_busy(p))
1771 goto busy_buffer_page;
1772 } while (tmp != bh);
1774 spin_lock(&unused_list_lock);
1775 tmp = bh;
1776 do {
1777 struct buffer_head * p = tmp;
1778 tmp = tmp->b_this_page;
1780 /* The buffer can be either on the regular
1781 * queues or on the free list..
1783 if (p->b_dev == B_FREE) {
1784 __remove_from_free_list(p, index);
1785 } else {
1786 if (p->b_pprev)
1787 __hash_unlink(p);
1788 __remove_from_lru_list(p, p->b_list);
1790 __put_unused_buffer_head(p);
1791 } while (tmp != bh);
1792 spin_unlock(&unused_list_lock);
1794 /* Wake up anyone waiting for buffer heads */
1795 wake_up(&buffer_wait);
1797 /* And free the page */
1798 page->buffers = NULL;
1799 __free_page(page);
1800 ret = 1;
1801 out:
1802 spin_unlock(&free_list[index].lock);
1803 write_unlock(&hash_table_lock);
1804 spin_unlock(&lru_list_lock);
1805 return ret;
1807 busy_buffer_page:
1808 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
1809 too_many_dirty_buffers = 1;
1810 wakeup_bdflush(0);
1811 ret = 0;
1812 goto out;
1815 /* ===================== Init ======================= */
1818 * allocate the hash table and init the free list
1819 * Use gfp() for the hash table to decrease TLB misses, use
1820 * SLAB cache for buffer heads.
1822 void __init buffer_init(unsigned long memory_size)
1824 int order, i;
1825 unsigned int nr_hash;
1827 /* The buffer cache hash table is less important these days,
1828 * trim it a bit.
1830 memory_size >>= 14;
1831 memory_size *= sizeof(struct buffer_head *);
1832 for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
1835 /* try to allocate something until we get it or we're asking
1836 for something that is really too small */
1838 do {
1839 unsigned long tmp;
1841 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
1842 bh_hash_mask = (nr_hash - 1);
1844 tmp = nr_hash;
1845 bh_hash_shift = 0;
1846 while((tmp >>= 1UL) != 0UL)
1847 bh_hash_shift++;
1849 hash_table = (struct buffer_head **)
1850 __get_free_pages(GFP_ATOMIC, order);
1851 } while (hash_table == NULL && --order > 0);
1852 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
1853 nr_hash, order, (1UL<<order) * PAGE_SIZE);
1855 if (!hash_table)
1856 panic("Failed to allocate buffer hash table\n");
1858 /* Setup hash chains. */
1859 for(i = 0; i < nr_hash; i++)
1860 hash_table[i] = NULL;
1862 /* Setup free lists. */
1863 for(i = 0; i < NR_SIZES; i++) {
1864 free_list[i].list = NULL;
1865 free_list[i].lock = SPIN_LOCK_UNLOCKED;
1868 /* Setup lru lists. */
1869 for(i = 0; i < NR_LIST; i++)
1870 lru_list[i] = NULL;
1872 bh_cachep = kmem_cache_create("buffer_head",
1873 sizeof(struct buffer_head),
1875 SLAB_HWCACHE_ALIGN, NULL, NULL);
1876 if(!bh_cachep)
1877 panic("Cannot create buffer head SLAB cache\n");
1881 /* ====================== bdflush support =================== */
1883 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1884 * response to dirty buffers. Once this process is activated, we write back
1885 * a limited number of buffers to the disks and then go back to sleep again.
1887 static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
1888 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
1889 struct task_struct *bdflush_tsk = 0;
1891 void wakeup_bdflush(int wait)
1893 if (current == bdflush_tsk)
1894 return;
1895 if (wait)
1896 run_task_queue(&tq_disk);
1897 wake_up(&bdflush_wait);
1898 if (wait)
1899 sleep_on(&bdflush_done);
1904 * Here we attempt to write back old buffers. We also try to flush inodes
1905 * and supers as well, since this function is essentially "update", and
1906 * otherwise there would be no way of ensuring that these quantities ever
1907 * get written back. Ideally, we would have a timestamp on the inodes
1908 * and superblocks so that we could write back only the old ones as well
1911 static int sync_old_buffers(void)
1913 int nlist;
1915 lock_kernel();
1916 sync_supers(0);
1917 sync_inodes(0);
1918 unlock_kernel();
1920 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
1921 struct buffer_head *bh;
1922 repeat:
1923 spin_lock(&lru_list_lock);
1924 bh = lru_list[nlist];
1925 if(bh) {
1926 struct buffer_head *next;
1927 int i;
1928 for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1929 next = bh->b_next_free;
1931 /* If the buffer is not on the proper list,
1932 * then refile it.
1934 if ((nlist == BUF_DIRTY &&
1935 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
1936 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
1937 __refile_buffer(bh);
1938 continue;
1941 if (buffer_locked(bh) || !buffer_dirty(bh))
1942 continue;
1944 /* OK, now we are committed to write it out. */
1945 bh->b_flushtime = 0;
1946 atomic_inc(&bh->b_count);
1947 spin_unlock(&lru_list_lock);
1948 ll_rw_block(WRITE, 1, &bh);
1949 atomic_dec(&bh->b_count);
1950 goto repeat;
1953 spin_unlock(&lru_list_lock);
1955 run_task_queue(&tq_disk);
1956 return 0;
1959 struct mm_struct * start_lazy_tlb(void)
1961 struct mm_struct *mm = current->mm;
1962 atomic_inc(&mm->mm_count);
1963 current->mm = NULL;
1964 /* active_mm is still 'mm' */
1965 return mm;
1968 void end_lazy_tlb(struct mm_struct *mm)
1970 struct mm_struct *active_mm = current->active_mm;
1972 current->mm = mm;
1973 if (mm != active_mm) {
1974 current->active_mm = mm;
1975 activate_context();
1977 mmdrop(active_mm);
1980 /* This is the interface to bdflush. As we get more sophisticated, we can
1981 * pass tuning parameters to this "process", to adjust how it behaves.
1982 * We would want to verify each parameter, however, to make sure that it
1983 * is reasonable. */
1985 asmlinkage int sys_bdflush(int func, long data)
1987 if (!capable(CAP_SYS_ADMIN))
1988 return -EPERM;
1990 if (func == 1) {
1991 int error;
1992 struct mm_struct *user_mm;
1995 * bdflush will spend all of its time in kernel-space,
1996 * without touching user-space, so we can switch it into
1997 * 'lazy TLB mode' to reduce the cost of context-switches
1998 * to and from bdflush.
2000 user_mm = start_lazy_tlb();
2001 error = sync_old_buffers();
2002 end_lazy_tlb(user_mm);
2003 return error;
2006 /* Basically func 1 means read param 1, 2 means write param 1, etc */
2007 if (func >= 2) {
2008 int i = (func-2) >> 1;
2009 if (i >= 0 && i < N_PARAM) {
2010 if ((func & 1) == 0)
2011 return put_user(bdf_prm.data[i], (int*)data);
2013 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2014 bdf_prm.data[i] = data;
2015 return 0;
2018 return -EINVAL;
2021 /* Historically, func 0 launched the actual bdflush and then never
2022 * returned (unless it was explicitly killed). We return zero here to
2023 * remain semi-compatible with present update(8) programs.
2024 */
2025 return 0;
2029 * This is the actual bdflush daemon itself. It used to be started from
2030 * the syscall above, but now we launch it ourselves internally with
2031 * kernel_thread(...) directly after the first thread in init/main.c
2033 int bdflush(void * unused)
2036 * We have a bare-bones task_struct, and really should fill
2037 * in a few more things so "top" and /proc/2/{exe,root,cwd}
2038 * display semi-sane things. Not real crucial though...
2041 current->session = 1;
2042 current->pgrp = 1;
2043 sprintf(current->comm, "kflushd");
2044 bdflush_tsk = current;
2046 for (;;) {
2047 int nlist;
2049 CHECK_EMERGENCY_SYNC
2051 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2052 int nr, major, written = 0;
2053 struct buffer_head *next;
2055 repeat:
2056 spin_lock(&lru_list_lock);
2057 next = lru_list[nlist];
2058 nr = nr_buffers_type[nlist];
2059 while (nr-- > 0) {
2060 struct buffer_head *bh = next;
2062 next = next->b_next_free;
2064 /* If the buffer is not on the correct list,
2065 * then refile it.
2067 if ((nlist == BUF_DIRTY &&
2068 (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2069 (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2070 __refile_buffer(bh);
2071 continue;
2074 /* If we aren't in panic mode, don't write out too much
2075 * at a time. Also, don't write out buffers we don't
2076 * really have to write out yet..
2078 if (!too_many_dirty_buffers) {
2079 if (written > bdf_prm.b_un.ndirty)
2080 break;
2081 if (time_before(jiffies, bh->b_flushtime))
2082 continue;
2085 if (buffer_locked(bh) || !buffer_dirty(bh))
2086 continue;
2088 major = MAJOR(bh->b_dev);
2089 written++;
2090 bh->b_flushtime = 0;
2093 * For the loop major we can try to do asynchronous writes,
2094 * but we have to guarantee that we're making some progress..
2096 atomic_inc(&bh->b_count);
2097 spin_unlock(&lru_list_lock);
2098 if (major == LOOP_MAJOR && written > 1) {
2099 ll_rw_block(WRITEA, 1, &bh);
2100 if (buffer_dirty(bh))
2101 --written;
2102 } else
2103 ll_rw_block(WRITE, 1, &bh);
2104 atomic_dec(&bh->b_count);
2105 goto repeat;
2107 spin_unlock(&lru_list_lock);
2109 run_task_queue(&tq_disk);
2110 wake_up(&bdflush_done);
2113 * If there are still a lot of dirty buffers around,
2114 * skip the sleep and flush some more. Otherwise, we
2115 * sleep for a while and mark us as not being in panic
2116 * mode..
2118 if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
2119 too_many_dirty_buffers = 0;
2120 spin_lock_irq(&current->sigmask_lock);
2121 flush_signals(current);
2122 spin_unlock_irq(&current->sigmask_lock);
2123 interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);