fs/buffer.c

   1 /*
   2  *  linux/fs/buffer.c
   3  *
   4  *  Copyright (C) 1991, 1992  Linus Torvalds
   5  */
   6
   7 /*
   8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
   9  * been avoided by NEVER letting an interrupt change a buffer (except for the
  10  * data, of course), but instead letting the caller do it.
  11  */
  12
  13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
  14
  15 /* Removed a lot of unnecessary code and simplified things now that
  16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  17  */
  18
  19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
  20  * hash table, use SLAB cache for buffer heads. -DaveM
  21  */
  22
  23 /* Added 32k buffer block sizes - these are required older ARM systems.
  24  * - RMK
  25  */
  26
  27 /* Thread it... -DaveM */
  28
  29 #include <linux/sched.h>
  30 #include <linux/fs.h>
  31 #include <linux/malloc.h>
  32 #include <linux/locks.h>
  33 #include <linux/errno.h>
  34 #include <linux/swap.h>
  35 #include <linux/swapctl.h>
  36 #include <linux/smp_lock.h>
  37 #include <linux/vmalloc.h>
  38 #include <linux/blkdev.h>
  39 #include <linux/sysrq.h>
  40 #include <linux/file.h>
  41 #include <linux/init.h>
  42 #include <linux/quotaops.h>
  43
  44 #include <asm/uaccess.h>
  45 #include <asm/io.h>
  46 #include <asm/bitops.h>
  47
  48 #define NR_SIZES 7
  49 static char buffersize_index[65] =
  50 {-1,  0,  1, -1,  2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
  51   4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  52   5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  53  -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
  54   6};
  55
  56 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
  57 #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
  58 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
  59 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
  60                                              number of unused buffer heads */
  61
  62 /* Anti-deadlock ordering:
  63  *      lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
  64  */
  65
  66 /*
  67  * Hash table gook..
  68  */
  69 static unsigned int bh_hash_mask = 0;
  70 static unsigned int bh_hash_shift = 0;
  71 static struct buffer_head **hash_table;
  72 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
  73
  74 static struct buffer_head *lru_list[NR_LIST];
  75 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
  76 static int nr_buffers_type[NR_LIST] = {0,};
  77
  78 static struct buffer_head * unused_list = NULL;
  79 static int nr_unused_buffer_heads = 0;
  80 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
  81 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
  82
  83 struct bh_free_head {
  84         struct buffer_head *list;
  85         spinlock_t lock;
  86 };
  87 static struct bh_free_head free_list[NR_SIZES];
  88
  89 static kmem_cache_t *bh_cachep;
  90
  91 static int grow_buffers(int size);
  92
  93 /* This is used by some architectures to estimate available memory. */
  94 atomic_t buffermem = ATOMIC_INIT(0);
  95
  96 /* Here is the parameter block for the bdflush process. If you add or
  97  * remove any of the parameters, make sure to update kernel/sysctl.c.
  98  */
  99
 100 #define N_PARAM 9
 101
 102 /* The dummy values in this structure are left in there for compatibility
 103  * with old programs that play with the /proc entries.
 104  */
 105 union bdflush_param {
 106         struct {
 107                 int nfract;  /* Percentage of buffer cache dirty to
 108                                 activate bdflush */
 109                 int ndirty;  /* Maximum number of dirty blocks to write out per
 110                                 wake-cycle */
 111                 int nrefill; /* Number of clean buffers to try to obtain
 112                                 each time we call refill */
 113                 int nref_dirt; /* Dirty buffer threshold for activating bdflush
 114                                   when trying to refill buffers. */
 115                 int dummy1;    /* unused */
 116                 int age_buffer;  /* Time for normal buffer to age before we flush it */
 117                 int age_super;  /* Time for superblock to age before we flush it */
 118                 int dummy2;    /* unused */
 119                 int dummy3;    /* unused */
 120         } b_un;
 121         unsigned int data[N_PARAM];
 122 } bdf_prm = {{40, 500, 64, 256, 15, 30*HZ, 5*HZ, 1884, 2}};
 123
 124 /* These are the min and max parameter values that we will allow to be assigned */
 125 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
 126 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
 127
 128 void wakeup_bdflush(int);
 129
 130 /*
 131  * Rewrote the wait-routines to use the "new" wait-queue functionality,
 132  * and getting rid of the cli-sti pairs. The wait-queue routines still
 133  * need cli-sti, but now it's just a couple of 386 instructions or so.
 134  *
 135  * Note that the real wait_on_buffer() is an inline function that checks
 136  * if 'b_wait' is set before calling this, so that the queues aren't set
 137  * up unnecessarily.
 138  */
 139 void __wait_on_buffer(struct buffer_head * bh)
 140 {
 141         struct task_struct *tsk = current;
 142         DECLARE_WAITQUEUE(wait, tsk);
 143
 144         atomic_inc(&bh->b_count);
 145         add_wait_queue(&bh->b_wait, &wait);
 146 repeat:
 147         tsk->state = TASK_UNINTERRUPTIBLE;
 148         run_task_queue(&tq_disk);
 149         if (buffer_locked(bh)) {
 150                 schedule();
 151                 goto repeat;
 152         }
 153         tsk->state = TASK_RUNNING;
 154         remove_wait_queue(&bh->b_wait, &wait);
 155         atomic_dec(&bh->b_count);
 156 }
 157
 158 /* Call sync_buffers with wait!=0 to ensure that the call does not
 159  * return until all buffer writes have completed.  Sync() may return
 160  * before the writes have finished; fsync() may not.
 161  */
 162
 163 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
 164  * spontaneously dirty themselves without ever brelse being called.
 165  * We will ultimately want to put these in a separate list, but for
 166  * now we search all of the lists for dirty buffers.
 167  */
 168 static int sync_buffers(kdev_t dev, int wait)
 169 {
 170         int i, retry, pass = 0, err = 0;
 171         struct buffer_head * bh, *next;
 172
 173         /* One pass for no-wait, three for wait:
 174          * 0) write out all dirty, unlocked buffers;
 175          * 1) write out all dirty buffers, waiting if locked;
 176          * 2) wait for completion by waiting for all buffers to unlock.
 177          */
 178         do {
 179                 retry = 0;
 180
 181                 /* We search all lists as a failsafe mechanism, not because we expect
 182                  * there to be dirty buffers on any of the other lists.
 183                  */
 184 repeat:
 185                 spin_lock(&lru_list_lock);
 186                 bh = lru_list[BUF_DIRTY];
 187                 if (!bh)
 188                         goto repeat2;
 189
 190                 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
 191                         next = bh->b_next_free;
 192
 193                         if (!lru_list[BUF_DIRTY])
 194                                 break;
 195                         if (dev && bh->b_dev != dev)
 196                                 continue;
 197                         if (buffer_locked(bh)) {
 198                                 /* Buffer is locked; skip it unless wait is
 199                                  * requested AND pass > 0.
 200                                  */
 201                                 if (!wait || !pass) {
 202                                         retry = 1;
 203                                         continue;
 204                                 }
 205                                 atomic_inc(&bh->b_count);
 206                                 spin_unlock(&lru_list_lock);
 207                                 wait_on_buffer (bh);
 208                                 atomic_dec(&bh->b_count);
 209                                 goto repeat;
 210                         }
 211
 212                         /* If an unlocked buffer is not uptodate, there has
 213                          * been an IO error. Skip it.
 214                          */
 215                         if (wait && buffer_req(bh) && !buffer_locked(bh) &&
 216                             !buffer_dirty(bh) && !buffer_uptodate(bh)) {
 217                                 err = -EIO;
 218                                 continue;
 219                         }
 220
 221                         /* Don't write clean buffers.  Don't write ANY buffers
 222                          * on the third pass.
 223                          */
 224                         if (!buffer_dirty(bh) || pass >= 2)
 225                                 continue;
 226
 227                         atomic_inc(&bh->b_count);
 228                         bh->b_flushtime = 0;
 229                         spin_unlock(&lru_list_lock);
 230                         ll_rw_block(WRITE, 1, &bh);
 231                         atomic_dec(&bh->b_count);
 232                         retry = 1;
 233                         goto repeat;
 234                 }
 235
 236     repeat2:
 237                 bh = lru_list[BUF_LOCKED];
 238                 if (!bh) {
 239                         spin_unlock(&lru_list_lock);
 240                         break;
 241                 }
 242                 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
 243                         next = bh->b_next_free;
 244
 245                         if (!lru_list[BUF_LOCKED])
 246                                 break;
 247                         if (dev && bh->b_dev != dev)
 248                                 continue;
 249                         if (buffer_locked(bh)) {
 250                                 /* Buffer is locked; skip it unless wait is
 251                                  * requested AND pass > 0.
 252                                  */
 253                                 if (!wait || !pass) {
 254                                         retry = 1;
 255                                         continue;
 256                                 }
 257                                 atomic_inc(&bh->b_count);
 258                                 spin_unlock(&lru_list_lock);
 259                                 wait_on_buffer (bh);
 260                                 spin_lock(&lru_list_lock);
 261                                 atomic_dec(&bh->b_count);
 262                                 goto repeat2;
 263                         }
 264                 }
 265                 spin_unlock(&lru_list_lock);
 266
 267                 /* If we are waiting for the sync to succeed, and if any dirty
 268                  * blocks were written, then repeat; on the second pass, only
 269                  * wait for buffers being written (do not pass to write any
 270                  * more buffers on the second pass).
 271                  */
 272         } while (wait && retry && ++pass<=2);
 273         return err;
 274 }
 275
 276 void sync_dev(kdev_t dev)
 277 {
 278         sync_buffers(dev, 0);
 279         sync_supers(dev);
 280         sync_inodes(dev);
 281         sync_buffers(dev, 0);
 282         DQUOT_SYNC(dev);
 283         /*
 284          * FIXME(eric) we need to sync the physical devices here.
 285          * This is because some (scsi) controllers have huge amounts of
 286          * cache onboard (hundreds of Mb), and we need to instruct
 287          * them to commit all of the dirty memory to disk, and we should
 288          * not return until this has happened.
 289          *
 290          * This would need to get implemented by going through the assorted
 291          * layers so that each block major number can be synced, and this
 292          * would call down into the upper and mid-layer scsi.
 293          */
 294 }
 295
 296 int fsync_dev(kdev_t dev)
 297 {
 298         sync_buffers(dev, 0);
 299
 300         lock_kernel();
 301         sync_supers(dev);
 302         sync_inodes(dev);
 303         DQUOT_SYNC(dev);
 304         unlock_kernel();
 305
 306         return sync_buffers(dev, 1);
 307 }
 308
 309 asmlinkage int sys_sync(void)
 310 {
 311         fsync_dev(0);
 312         return 0;
 313 }
 314
 315 /*
 316  *      filp may be NULL if called via the msync of a vma.
 317  */
 318
 319 int file_fsync(struct file *filp, struct dentry *dentry)
 320 {
 321         struct inode * inode = dentry->d_inode;
 322         struct super_block * sb;
 323         kdev_t dev;
 324
 325         /* sync the inode to buffers */
 326         write_inode_now(inode);
 327
 328         /* sync the superblock to buffers */
 329         sb = inode->i_sb;
 330         wait_on_super(sb);
 331         if (sb->s_op && sb->s_op->write_super)
 332                 sb->s_op->write_super(sb);
 333
 334         /* .. finally sync the buffers to disk */
 335         dev = inode->i_dev;
 336         return sync_buffers(dev, 1);
 337 }
 338
 339 asmlinkage int sys_fsync(unsigned int fd)
 340 {
 341         struct file * file;
 342         struct dentry * dentry;
 343         struct inode * inode;
 344         int err;
 345
 346         lock_kernel();
 347         err = -EBADF;
 348         file = fget(fd);
 349         if (!file)
 350                 goto out;
 351
 352         dentry = file->f_dentry;
 353         if (!dentry)
 354                 goto out_putf;
 355
 356         inode = dentry->d_inode;
 357         if (!inode)
 358                 goto out_putf;
 359
 360         err = -EINVAL;
 361         if (!file->f_op || !file->f_op->fsync)
 362                 goto out_putf;
 363
 364         /* We need to protect against concurrent writers.. */
 365         down(&inode->i_sem);
 366         err = file->f_op->fsync(file, dentry);
 367         up(&inode->i_sem);
 368
 369 out_putf:
 370         fput(file);
 371 out:
 372         unlock_kernel();
 373         return err;
 374 }
 375
 376 asmlinkage int sys_fdatasync(unsigned int fd)
 377 {
 378         struct file * file;
 379         struct dentry * dentry;
 380         struct inode * inode;
 381         int err;
 382
 383         lock_kernel();
 384         err = -EBADF;
 385         file = fget(fd);
 386         if (!file)
 387                 goto out;
 388
 389         dentry = file->f_dentry;
 390         if (!dentry)
 391                 goto out_putf;
 392
 393         inode = dentry->d_inode;
 394         if (!inode)
 395                 goto out_putf;
 396
 397         err = -EINVAL;
 398         if (!file->f_op || !file->f_op->fsync)
 399                 goto out_putf;
 400
 401         /* this needs further work, at the moment it is identical to fsync() */
 402         down(&inode->i_sem);
 403         err = file->f_op->fsync(file, dentry);
 404         up(&inode->i_sem);
 405
 406 out_putf:
 407         fput(file);
 408 out:
 409         unlock_kernel();
 410         return err;
 411 }
 412
 413 void invalidate_buffers(kdev_t dev)
 414 {
 415         int nlist;
 416
 417         spin_lock(&lru_list_lock);
 418         for(nlist = 0; nlist < NR_LIST; nlist++) {
 419                 struct buffer_head * bh;
 420                 int i;
 421         retry:
 422                 bh = lru_list[nlist];
 423                 if (!bh)
 424                         continue;
 425                 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
 426                         if (bh->b_dev != dev)
 427                                 continue;
 428                         if (buffer_locked(bh)) {
 429                                 atomic_inc(&bh->b_count);
 430                                 spin_unlock(&lru_list_lock);
 431                                 wait_on_buffer(bh);
 432                                 spin_lock(&lru_list_lock);
 433                                 atomic_dec(&bh->b_count);
 434                                 goto retry;
 435                         }
 436                         if (atomic_read(&bh->b_count))
 437                                 continue;
 438                         bh->b_flushtime = 0;
 439                         clear_bit(BH_Protected, &bh->b_state);
 440                         clear_bit(BH_Uptodate, &bh->b_state);
 441                         clear_bit(BH_Dirty, &bh->b_state);
 442                         clear_bit(BH_Req, &bh->b_state);
 443                 }
 444         }
 445         spin_unlock(&lru_list_lock);
 446 }
 447
 448 /* After several hours of tedious analysis, the following hash
 449  * function won.  Do not mess with it... -DaveM
 450  */
 451 #define _hashfn(dev,block)      \
 452         ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
 453          (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
 454 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
 455
 456 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
 457 {
 458         if ((bh->b_next = *head) != NULL)
 459                 bh->b_next->b_pprev = &bh->b_next;
 460         *head = bh;
 461         bh->b_pprev = head;
 462 }
 463
 464 static __inline__ void __hash_unlink(struct buffer_head *bh)
 465 {
 466         if (bh->b_next)
 467                 bh->b_next->b_pprev = bh->b_pprev;
 468         *(bh->b_pprev) = bh->b_next;
 469         bh->b_pprev = NULL;
 470 }
 471
 472 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
 473 {
 474         struct buffer_head **bhp = &lru_list[blist];
 475
 476         if(!*bhp) {
 477                 *bhp = bh;
 478                 bh->b_prev_free = bh;
 479         }
 480         bh->b_next_free = *bhp;
 481         bh->b_prev_free = (*bhp)->b_prev_free;
 482         (*bhp)->b_prev_free->b_next_free = bh;
 483         (*bhp)->b_prev_free = bh;
 484         nr_buffers_type[blist]++;
 485 }
 486
 487 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
 488 {
 489         if (bh->b_prev_free || bh->b_next_free) {
 490                 bh->b_prev_free->b_next_free = bh->b_next_free;
 491                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 492                 if (lru_list[blist] == bh)
 493                         lru_list[blist] = bh->b_next_free;
 494                 if (lru_list[blist] == bh)
 495                         lru_list[blist] = NULL;
 496                 bh->b_next_free = bh->b_prev_free = NULL;
 497                 nr_buffers_type[blist]--;
 498         }
 499 }
 500
 501 static void __remove_from_free_list(struct buffer_head * bh, int index)
 502 {
 503         if(bh->b_next_free == bh)
 504                  free_list[index].list = NULL;
 505         else {
 506                 bh->b_prev_free->b_next_free = bh->b_next_free;
 507                 bh->b_next_free->b_prev_free = bh->b_prev_free;
 508                 if (free_list[index].list == bh)
 509                          free_list[index].list = bh->b_next_free;
 510         }
 511         bh->b_next_free = bh->b_prev_free = NULL;
 512 }
 513
 514 /* The following two functions must operate atomically
 515  * because they control the visibility of a buffer head
 516  * to the rest of the kernel.
 517  */
 518 static __inline__ void __remove_from_queues(struct buffer_head *bh)
 519 {
 520         write_lock(&hash_table_lock);
 521         if (bh->b_pprev)
 522                 __hash_unlink(bh);
 523         __remove_from_lru_list(bh, bh->b_list);
 524         write_unlock(&hash_table_lock);
 525 }
 526
 527 static void insert_into_queues(struct buffer_head *bh)
 528 {
 529         struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
 530
 531         spin_lock(&lru_list_lock);
 532         write_lock(&hash_table_lock);
 533         __hash_link(bh, head);
 534         __insert_into_lru_list(bh, bh->b_list);
 535         write_unlock(&hash_table_lock);
 536         spin_unlock(&lru_list_lock);
 537 }
 538
 539 /* This function must only run if there are no other
 540  * references _anywhere_ to this buffer head.
 541  */
 542 static void put_last_free(struct buffer_head * bh)
 543 {
 544         struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
 545         struct buffer_head **bhp = &head->list;
 546
 547         spin_lock(&head->lock);
 548         bh->b_dev = B_FREE;
 549         if(!*bhp) {
 550                 *bhp = bh;
 551                 bh->b_prev_free = bh;
 552         }
 553         bh->b_next_free = *bhp;
 554         bh->b_prev_free = (*bhp)->b_prev_free;
 555         (*bhp)->b_prev_free->b_next_free = bh;
 556         (*bhp)->b_prev_free = bh;
 557         spin_unlock(&head->lock);
 558 }
 559
 560 /*
 561  * Why like this, I hear you say... The reason is race-conditions.
 562  * As we don't lock buffers (unless we are reading them, that is),
 563  * something might happen to it while we sleep (ie a read-error
 564  * will force it bad). This shouldn't really happen currently, but
 565  * the code is ready.
 566  */
 567 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
 568 {
 569         struct buffer_head **head = &hash(dev, block);
 570         struct buffer_head *bh;
 571
 572         read_lock(&hash_table_lock);
 573         for(bh = *head; bh; bh = bh->b_next)
 574                 if (bh->b_blocknr == block      &&
 575                     bh->b_size    == size       &&
 576                     bh->b_dev     == dev)
 577                         break;
 578         if (bh)
 579                 atomic_inc(&bh->b_count);
 580         read_unlock(&hash_table_lock);
 581
 582         return bh;
 583 }
 584
 585 unsigned int get_hardblocksize(kdev_t dev)
 586 {
 587         /*
 588          * Get the hard sector size for the given device.  If we don't know
 589          * what it is, return 0.
 590          */
 591         if (hardsect_size[MAJOR(dev)] != NULL) {
 592                 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
 593                 if (blksize != 0)
 594                         return blksize;
 595         }
 596
 597         /*
 598          * We don't know what the hardware sector size for this device is.
 599          * Return 0 indicating that we don't know.
 600          */
 601         return 0;
 602 }
 603
 604 void set_blocksize(kdev_t dev, int size)
 605 {
 606         extern int *blksize_size[];
 607         int i, nlist;
 608         struct buffer_head * bh, *bhnext;
 609
 610         if (!blksize_size[MAJOR(dev)])
 611                 return;
 612
 613         /* Size must be a power of two, and between 512 and PAGE_SIZE */
 614         if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
 615                 panic("Invalid blocksize passed to set_blocksize");
 616
 617         if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
 618                 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 619                 return;
 620         }
 621         if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
 622                 return;
 623         sync_buffers(dev, 2);
 624         blksize_size[MAJOR(dev)][MINOR(dev)] = size;
 625
 626         /* We need to be quite careful how we do this - we are moving entries
 627          * around on the free list, and we can get in a loop if we are not careful.
 628          */
 629         for(nlist = 0; nlist < NR_LIST; nlist++) {
 630         repeat:
 631                 spin_lock(&lru_list_lock);
 632                 bh = lru_list[nlist];
 633                 for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
 634                         if(!bh)
 635                                 break;
 636
 637                         bhnext = bh->b_next_free;
 638                         if (bh->b_dev != dev)
 639                                  continue;
 640                         if (bh->b_size == size)
 641                                  continue;
 642                         if (buffer_locked(bh)) {
 643                                 atomic_inc(&bh->b_count);
 644                                 spin_unlock(&lru_list_lock);
 645                                 wait_on_buffer(bh);
 646                                 atomic_dec(&bh->b_count);
 647                                 goto repeat;
 648                         }
 649                         if (bh->b_dev == dev && bh->b_size != size) {
 650                                 clear_bit(BH_Dirty, &bh->b_state);
 651                                 clear_bit(BH_Uptodate, &bh->b_state);
 652                                 clear_bit(BH_Req, &bh->b_state);
 653                                 bh->b_flushtime = 0;
 654                         }
 655                         if (atomic_read(&bh->b_count) == 0) {
 656                                 __remove_from_queues(bh);
 657                                 put_last_free(bh);
 658                         }
 659                 }
 660                 spin_unlock(&lru_list_lock);
 661         }
 662 }
 663
 664 /*
 665  * We used to try various strange things. Let's not.
 666  */
 667 static void refill_freelist(int size)
 668 {
 669         if (!grow_buffers(size)) {
 670                 wakeup_bdflush(1);
 671                 current->policy |= SCHED_YIELD;
 672                 schedule();
 673         }
 674 }
 675
 676 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
 677 {
 678         bh->b_list = BUF_CLEAN;
 679         bh->b_flushtime = 0;
 680         bh->b_end_io = handler;
 681         bh->b_dev_id = dev_id;
 682 }
 683
 684 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 685 {
 686         mark_buffer_uptodate(bh, uptodate);
 687         unlock_buffer(bh);
 688 }
 689
 690 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
 691 {
 692         mark_buffer_uptodate(bh, uptodate);
 693         unlock_buffer(bh);
 694         BUG();
 695 }
 696
 697 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
 698 {
 699         static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 700         unsigned long flags;
 701         struct buffer_head *tmp;
 702         struct page *page;
 703         int free;
 704
 705         mark_buffer_uptodate(bh, uptodate);
 706
 707         /* This is a temporary buffer used for page I/O. */
 708         page = mem_map + MAP_NR(bh->b_data);
 709
 710         if (!uptodate)
 711                 SetPageError(page);
 712
 713         /*
 714          * Be _very_ careful from here on. Bad things can happen if
 715          * two buffer heads end IO at almost the same time and both
 716          * decide that the page is now completely done.
 717          *
 718          * Async buffer_heads are here only as labels for IO, and get
 719          * thrown away once the IO for this page is complete.  IO is
 720          * deemed complete once all buffers have been visited
 721          * (b_count==0) and are now unlocked. We must make sure that
 722          * only the _last_ buffer that decrements its count is the one
 723          * that free's the page..
 724          */
 725         spin_lock_irqsave(&page_uptodate_lock, flags);
 726         unlock_buffer(bh);
 727         atomic_dec(&bh->b_count);
 728         tmp = bh->b_this_page;
 729         while (tmp != bh) {
 730                 if (atomic_read(&tmp->b_count) &&
 731                     (tmp->b_end_io == end_buffer_io_async))
 732                         goto still_busy;
 733                 tmp = tmp->b_this_page;
 734         }
 735
 736         /* OK, the async IO on this page is complete. */
 737         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 738
 739         /*
 740          * if none of the buffers had errors then we can set the
 741          * page uptodate:
 742          */
 743         if (!PageError(page))
 744                 SetPageUptodate(page);
 745
 746         /*
 747          * Run the hooks that have to be done when a page I/O has completed.
 748          *
 749          * Note - we need to test the flags before we unlock the page, but
 750          * we must not actually free the page until after the unlock!
 751          */
 752         if (test_and_clear_bit(PG_decr_after, &page->flags))
 753                 atomic_dec(&nr_async_pages);
 754
 755         if (test_and_clear_bit(PG_free_swap_after, &page->flags))
 756                 swap_free(page->offset);
 757
 758         free = test_and_clear_bit(PG_free_after, &page->flags);
 759
 760         if (page->owner != -1)
 761                 PAGE_BUG(page);
 762         page->owner = (int)current;
 763         UnlockPage(page);
 764
 765         if (free)
 766                 __free_page(page);
 767
 768         return;
 769
 770 still_busy:
 771         spin_unlock_irqrestore(&page_uptodate_lock, flags);
 772         return;
 773 }
 774
 775
 776 /*
 777  * Ok, this is getblk, and it isn't very clear, again to hinder
 778  * race-conditions. Most of the code is seldom used, (ie repeating),
 779  * so it should be much more efficient than it looks.
 780  *
 781  * The algorithm is changed: hopefully better, and an elusive bug removed.
 782  *
 783  * 14.02.92: changed it to sync dirty buffers a bit: better performance
 784  * when the filesystem starts to get full of dirty blocks (I hope).
 785  */
 786 struct buffer_head * getblk(kdev_t dev, int block, int size)
 787 {
 788         struct buffer_head * bh;
 789         int isize;
 790
 791 repeat:
 792         bh = get_hash_table(dev, block, size);
 793         if (bh) {
 794                 if (!buffer_dirty(bh)) {
 795                         bh->b_flushtime = 0;
 796                 }
 797                 goto out;
 798         }
 799
 800         isize = BUFSIZE_INDEX(size);
 801         spin_lock(&free_list[isize].lock);
 802         bh = free_list[isize].list;
 803         if (bh) {
 804                 __remove_from_free_list(bh, isize);
 805                 atomic_set(&bh->b_count, 1);
 806         }
 807         spin_unlock(&free_list[isize].lock);
 808         if (!bh)
 809                 goto refill;
 810
 811         /* OK, FINALLY we know that this buffer is the only one of its kind,
 812          * we hold a reference (b_count>0), it is unlocked, and it is clean.
 813          */
 814         init_buffer(bh, end_buffer_io_sync, NULL);
 815         bh->b_dev = dev;
 816         bh->b_blocknr = block;
 817         bh->b_state = 1 << BH_Mapped;
 818
 819         /* Insert the buffer into the regular lists */
 820         insert_into_queues(bh);
 821         goto out;
 822
 823         /*
 824          * If we block while refilling the free list, somebody may
 825          * create the buffer first ... search the hashes again.
 826          */
 827 refill:
 828         refill_freelist(size);
 829         goto repeat;
 830 out:
 831         return bh;
 832 }
 833
 834 /*
 835  * if a new dirty buffer is created we need to balance bdflush.
 836  *
 837  * in the future we might want to make bdflush aware of different
 838  * pressures on different devices - thus the (currently unused)
 839  * 'dev' parameter.
 840  */
 841 int too_many_dirty_buffers;
 842
 843 void balance_dirty(kdev_t dev)
 844 {
 845         int dirty = nr_buffers_type[BUF_DIRTY];
 846         int ndirty = bdf_prm.b_un.ndirty;
 847
 848         if (dirty > ndirty) {
 849                 if (dirty > 2*ndirty) {
 850                         too_many_dirty_buffers = 1;
 851                         wakeup_bdflush(1);
 852                         return;
 853                 }
 854                 wakeup_bdflush(0);
 855         }
 856         too_many_dirty_buffers = 0;
 857         return;
 858 }
 859
 860 static inline void __mark_dirty(struct buffer_head *bh, int flag)
 861 {
 862         bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
 863         clear_bit(BH_New, &bh->b_state);
 864         refile_buffer(bh);
 865 }
 866
 867 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
 868 {
 869         __mark_dirty(bh, flag);
 870 }
 871
 872 /*
 873  * A buffer may need to be moved from one buffer list to another
 874  * (e.g. in case it is not shared any more). Handle this.
 875  */
 876 static __inline__ void __refile_buffer(struct buffer_head *bh)
 877 {
 878         int dispose = BUF_CLEAN;
 879         if (buffer_locked(bh))
 880                 dispose = BUF_LOCKED;
 881         if (buffer_dirty(bh))
 882                 dispose = BUF_DIRTY;
 883         if (dispose != bh->b_list) {
 884                 __remove_from_lru_list(bh, bh->b_list);
 885                 bh->b_list = dispose;
 886                 __insert_into_lru_list(bh, dispose);
 887         }
 888 }
 889
 890 void refile_buffer(struct buffer_head *bh)
 891 {
 892         spin_lock(&lru_list_lock);
 893         __refile_buffer(bh);
 894         spin_unlock(&lru_list_lock);
 895 }
 896
 897 /*
 898  * Release a buffer head
 899  */
 900 void __brelse(struct buffer_head * buf)
 901 {
 902         touch_buffer(buf);
 903
 904         if (atomic_read(&buf->b_count)) {
 905                 atomic_dec(&buf->b_count);
 906                 return;
 907         }
 908         printk("VFS: brelse: Trying to free free buffer\n");
 909 }
 910
 911 /*
 912  * bforget() is like brelse(), except it puts the buffer on the
 913  * free list if it can.. We can NOT free the buffer if:
 914  *  - there are other users of it
 915  *  - it is locked and thus can have active IO
 916  */
 917 void __bforget(struct buffer_head * buf)
 918 {
 919         spin_lock(&lru_list_lock);
 920         write_lock(&hash_table_lock);
 921         if (atomic_read(&buf->b_count) != 1 || buffer_locked(buf)) {
 922                 touch_buffer(buf);
 923                 atomic_dec(&buf->b_count);
 924         } else {
 925                 atomic_set(&buf->b_count, 0);
 926                 buf->b_state = 0;
 927                 if (buf->b_pprev)
 928                         __hash_unlink(buf);
 929                 __remove_from_lru_list(buf, buf->b_list);
 930                 put_last_free(buf);
 931         }
 932         write_unlock(&hash_table_lock);
 933         spin_unlock(&lru_list_lock);
 934 }
 935
 936 /*
 937  * bread() reads a specified block and returns the buffer that contains
 938  * it. It returns NULL if the block was unreadable.
 939  */
 940 struct buffer_head * bread(kdev_t dev, int block, int size)
 941 {
 942         struct buffer_head * bh;
 943
 944         bh = getblk(dev, block, size);
 945         if (buffer_uptodate(bh))
 946                 return bh;
 947         ll_rw_block(READ, 1, &bh);
 948         wait_on_buffer(bh);
 949         if (buffer_uptodate(bh))
 950                 return bh;
 951         brelse(bh);
 952         return NULL;
 953 }
 954
 955 /*
 956  * Ok, breada can be used as bread, but additionally to mark other
 957  * blocks for reading as well. End the argument list with a negative
 958  * number.
 959  */
 960
 961 #define NBUF 16
 962
 963 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
 964         unsigned int pos, unsigned int filesize)
 965 {
 966         struct buffer_head * bhlist[NBUF];
 967         unsigned int blocks;
 968         struct buffer_head * bh;
 969         int index;
 970         int i, j;
 971
 972         if (pos >= filesize)
 973                 return NULL;
 974
 975         if (block < 0)
 976                 return NULL;
 977
 978         bh = getblk(dev, block, bufsize);
 979         index = BUFSIZE_INDEX(bh->b_size);
 980
 981         if (buffer_uptodate(bh))
 982                 return(bh);
 983         else ll_rw_block(READ, 1, &bh);
 984
 985         blocks = (filesize - pos) >> (9+index);
 986
 987         if (blocks < (read_ahead[MAJOR(dev)] >> index))
 988                 blocks = read_ahead[MAJOR(dev)] >> index;
 989         if (blocks > NBUF)
 990                 blocks = NBUF;
 991
 992 /*      if (blocks) printk("breada (new) %d blocks\n",blocks); */
 993
 994         bhlist[0] = bh;
 995         j = 1;
 996         for(i=1; i<blocks; i++) {
 997                 bh = getblk(dev,block+i,bufsize);
 998                 if (buffer_uptodate(bh)) {
 999                         brelse(bh);
1000                         break;
1001                 }
1002                 else bhlist[j++] = bh;
1003         }
1004
1005         /* Request the read for these buffers, and then release them. */
1006         if (j>1)
1007                 ll_rw_block(READA, (j-1), bhlist+1);
1008         for(i=1; i<j; i++)
1009                 brelse(bhlist[i]);
1010
1011         /* Wait for this buffer, and then continue on. */
1012         bh = bhlist[0];
1013         wait_on_buffer(bh);
1014         if (buffer_uptodate(bh))
1015                 return bh;
1016         brelse(bh);
1017         return NULL;
1018 }
1019
1020 /*
1021  * Note: the caller should wake up the buffer_wait list if needed.
1022  */
1023 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1024 {
1025         if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1026                 kmem_cache_free(bh_cachep, bh);
1027         } else {
1028                 bh->b_blocknr = -1;
1029                 init_waitqueue_head(&bh->b_wait);
1030                 nr_unused_buffer_heads++;
1031                 bh->b_next_free = unused_list;
1032                 bh->b_this_page = NULL;
1033                 unused_list = bh;
1034         }
1035 }
1036
1037 static void put_unused_buffer_head(struct buffer_head *bh)
1038 {
1039         spin_lock(&unused_list_lock);
1040         __put_unused_buffer_head(bh);
1041         spin_unlock(&unused_list_lock);
1042 }
1043
1044 /*
1045  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1046  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1047  * buffer heads is now handled in create_buffers().
1048  */
1049 static struct buffer_head * get_unused_buffer_head(int async)
1050 {
1051         struct buffer_head * bh;
1052
1053         spin_lock(&unused_list_lock);
1054         if (nr_unused_buffer_heads > NR_RESERVED) {
1055                 bh = unused_list;
1056                 unused_list = bh->b_next_free;
1057                 nr_unused_buffer_heads--;
1058                 spin_unlock(&unused_list_lock);
1059                 return bh;
1060         }
1061         spin_unlock(&unused_list_lock);
1062
1063         /* This is critical.  We can't swap out pages to get
1064          * more buffer heads, because the swap-out may need
1065          * more buffer-heads itself.  Thus SLAB_BUFFER.
1066          */
1067         if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1068                 memset(bh, 0, sizeof(*bh));
1069                 init_waitqueue_head(&bh->b_wait);
1070                 return bh;
1071         }
1072
1073         /*
1074          * If we need an async buffer, use the reserved buffer heads.
1075          */
1076         if (async) {
1077                 spin_lock(&unused_list_lock);
1078                 if (unused_list) {
1079                         bh = unused_list;
1080                         unused_list = bh->b_next_free;
1081                         nr_unused_buffer_heads--;
1082                         spin_unlock(&unused_list_lock);
1083                         return bh;
1084                 }
1085                 spin_unlock(&unused_list_lock);
1086         }
1087 #if 0
1088         /*
1089          * (Pending further analysis ...)
1090          * Ordinary (non-async) requests can use a different memory priority
1091          * to free up pages. Any swapping thus generated will use async
1092          * buffer heads.
1093          */
1094         if(!async &&
1095            (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1096                 memset(bh, 0, sizeof(*bh));
1097                 init_waitqueue_head(&bh->b_wait);
1098                 return bh;
1099         }
1100 #endif
1101
1102         return NULL;
1103 }
1104
1105 /*
1106  * Create the appropriate buffers when given a page for data area and
1107  * the size of each buffer.. Use the bh->b_this_page linked list to
1108  * follow the buffers created.  Return NULL if unable to create more
1109  * buffers.
1110  * The async flag is used to differentiate async IO (paging, swapping)
1111  * from ordinary buffer allocations, and only async requests are allowed
1112  * to sleep waiting for buffer heads.
1113  */
1114 static struct buffer_head * create_buffers(unsigned long page, unsigned long size, int async)
1115 {
1116         DECLARE_WAITQUEUE(wait, current);
1117         struct buffer_head *bh, *head;
1118         long offset;
1119
1120 try_again:
1121         head = NULL;
1122         offset = PAGE_SIZE;
1123         while ((offset -= size) >= 0) {
1124                 bh = get_unused_buffer_head(async);
1125                 if (!bh)
1126                         goto no_grow;
1127
1128                 bh->b_dev = B_FREE;  /* Flag as unused */
1129                 bh->b_this_page = head;
1130                 head = bh;
1131
1132                 bh->b_state = 0;
1133                 bh->b_next_free = NULL;
1134                 bh->b_pprev = NULL;
1135                 atomic_set(&bh->b_count, 0);
1136                 bh->b_size = size;
1137
1138                 bh->b_data = (char *) (page+offset);
1139                 bh->b_list = BUF_CLEAN;
1140                 bh->b_flushtime = 0;
1141                 bh->b_end_io = end_buffer_io_bad;
1142         }
1143         return head;
1144 /*
1145  * In case anything failed, we just free everything we got.
1146  */
1147 no_grow:
1148         if (head) {
1149                 do {
1150                         bh = head;
1151                         head = head->b_this_page;
1152                         put_unused_buffer_head(bh);
1153                 } while (head);
1154
1155                 /* Wake up any waiters ... */
1156                 wake_up(&buffer_wait);
1157         }
1158
1159         /*
1160          * Return failure for non-async IO requests.  Async IO requests
1161          * are not allowed to fail, so we have to wait until buffer heads
1162          * become available.  But we don't want tasks sleeping with
1163          * partially complete buffers, so all were released above.
1164          */
1165         if (!async)
1166                 return NULL;
1167
1168         /* We're _really_ low on memory. Now we just
1169          * wait for old buffer heads to become free due to
1170          * finishing IO.  Since this is an async request and
1171          * the reserve list is empty, we're sure there are
1172          * async buffer heads in use.
1173          */
1174         run_task_queue(&tq_disk);
1175
1176         /*
1177          * Set our state for sleeping, then check again for buffer heads.
1178          * This ensures we won't miss a wake_up from an interrupt.
1179          */
1180         add_wait_queue(&buffer_wait, &wait);
1181         current->state = TASK_UNINTERRUPTIBLE;
1182         if (nr_unused_buffer_heads < MAX_BUF_PER_PAGE) {
1183                 current->policy |= SCHED_YIELD;
1184                 schedule();
1185         }
1186         remove_wait_queue(&buffer_wait, &wait);
1187         current->state = TASK_RUNNING;
1188         goto try_again;
1189 }
1190
1191 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1192 {
1193         struct buffer_head *head, *bh, *tail;
1194         int block;
1195
1196         if (!PageLocked(page))
1197                 BUG();
1198         if (page->owner != (int)current)
1199                 PAGE_BUG(page);
1200         /*
1201          * Allocate async buffer heads pointing to this page, just for I/O.
1202          * They show up in the buffer hash table and are registered in
1203          * page->buffers.
1204          */
1205         head = create_buffers(page_address(page), size, 1);
1206         if (page->buffers)
1207                 BUG();
1208         if (!head)
1209                 BUG();
1210         tail = head;
1211         for (bh = head; bh; bh = bh->b_this_page) {
1212                 block = *(b++);
1213
1214                 tail = bh;
1215                 init_buffer(bh, end_buffer_io_async, NULL);
1216                 bh->b_dev = dev;
1217                 bh->b_blocknr = block;
1218
1219                 /*
1220                  * When we use bmap, we define block zero to represent
1221                  * a hole.  ll_rw_page, however, may legitimately
1222                  * access block zero, and we need to distinguish the
1223                  * two cases.
1224                  */
1225                 if (bmap && !block) {
1226                         memset(bh->b_data, 0, size);
1227                         set_bit(BH_Uptodate, &bh->b_state);
1228                         continue;
1229                 }
1230                 set_bit(BH_Mapped, &bh->b_state);
1231         }
1232         tail->b_this_page = head;
1233         get_page(page);
1234         page->buffers = head;
1235         return 0;
1236 }
1237
1238 /*
1239  * We don't have to release all buffers here, but
1240  * we have to be sure that no dirty buffer is left
1241  * and no IO is going on (no buffer is locked), because
1242  * we have truncated the file and are going to free the
1243  * blocks on-disk..
1244  */
1245 int block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
1246 {
1247         struct buffer_head *head, *bh, *next;
1248         unsigned int curr_off = 0;
1249
1250         if (!PageLocked(page))
1251                 BUG();
1252         if (!page->buffers)
1253                 return 0;
1254
1255         head = page->buffers;
1256         bh = head;
1257         do {
1258                 unsigned int next_off = curr_off + bh->b_size;
1259                 next = bh->b_this_page;
1260
1261                 /*
1262                  * is this block fully flushed?
1263                  */
1264                 if (offset <= curr_off) {
1265                         if (buffer_mapped(bh)) {
1266                                 atomic_inc(&bh->b_count);
1267                                 wait_on_buffer(bh);
1268                                 if (bh->b_dev == B_FREE)
1269                                         BUG();
1270                                 mark_buffer_clean(bh);
1271                                 clear_bit(BH_Uptodate, &bh->b_state);
1272                                 clear_bit(BH_Mapped, &bh->b_state);
1273                                 bh->b_blocknr = 0;
1274                                 atomic_dec(&bh->b_count);
1275                         }
1276                 }
1277                 curr_off = next_off;
1278                 bh = next;
1279         } while (bh != head);
1280
1281         /*
1282          * subtle. We release buffer-heads only if this is
1283          * the 'final' flushpage. We have invalidated the bmap
1284          * cached value unconditionally, so real IO is not
1285          * possible anymore.
1286          *
1287          * If the free doesn't work out, the buffers can be
1288          * left around - they just turn into anonymous buffers
1289          * instead.
1290          */
1291         if (!offset) {
1292                 if (!try_to_free_buffers(page))
1293                         atomic_add(PAGE_CACHE_SIZE, &buffermem);
1294         }
1295
1296         return 0;
1297 }
1298
1299 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1300 {
1301         struct buffer_head *bh, *head, *tail;
1302
1303         head = create_buffers(page_address(page), blocksize, 1);
1304         if (page->buffers)
1305                 BUG();
1306
1307         bh = head;
1308         do {
1309                 bh->b_dev = inode->i_dev;
1310                 bh->b_blocknr = 0;
1311                 bh->b_end_io = end_buffer_io_bad;
1312                 tail = bh;
1313                 bh = bh->b_this_page;
1314         } while (bh);
1315         tail->b_this_page = head;
1316         page->buffers = head;
1317         get_page(page);
1318 }
1319
1320 /*
1321  * block_write_full_page() is SMP-safe - currently it's still
1322  * being called with the kernel lock held, but the code is ready.
1323  */
1324 int block_write_full_page(struct file *file, struct page *page)
1325 {
1326         struct dentry *dentry = file->f_dentry;
1327         struct inode *inode = dentry->d_inode;
1328         int err, i;
1329         unsigned long block, offset;
1330         struct buffer_head *bh, *head;
1331
1332         if (!PageLocked(page))
1333                 BUG();
1334
1335         if (!page->buffers)
1336                 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1337         head = page->buffers;
1338
1339         offset = page->offset;
1340         block = offset >> inode->i_sb->s_blocksize_bits;
1341
1342         // FIXME: currently we assume page alignment.
1343         if (offset & (PAGE_SIZE-1))
1344                 BUG();
1345
1346         bh = head;
1347         i = 0;
1348         do {
1349                 if (!bh)
1350                         BUG();
1351
1352                 /*
1353                  * If the buffer isn't up-to-date, we can't be sure
1354                  * that the buffer has been initialized with the proper
1355                  * block number information etc..
1356                  *
1357                  * Leave it to the low-level FS to make all those
1358                  * decisions (block #0 may actually be a valid block)
1359                  */
1360                 bh->b_end_io = end_buffer_io_sync;
1361                 if (!buffer_mapped(bh)) {
1362                         err = inode->i_op->get_block(inode, block, bh, 1);
1363                         if (err)
1364                                 goto out;
1365                 }
1366                 set_bit(BH_Uptodate, &bh->b_state);
1367                 mark_buffer_dirty(bh,0);
1368
1369                 bh = bh->b_this_page;
1370                 block++;
1371         } while (bh != head);
1372
1373         SetPageUptodate(page);
1374         return 0;
1375 out:
1376         ClearPageUptodate(page);
1377         return err;
1378 }
1379
1380 int block_write_partial_page(struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf)
1381 {
1382         struct dentry *dentry = file->f_dentry;
1383         struct inode *inode = dentry->d_inode;
1384         unsigned long block;
1385         int err, partial;
1386         unsigned long blocksize, start_block, end_block;
1387         unsigned long start_offset, start_bytes, end_bytes;
1388         unsigned long bbits, blocks, i, len;
1389         struct buffer_head *bh, *head;
1390         char * target_buf;
1391
1392         target_buf = (char *)page_address(page) + offset;
1393
1394         if (!PageLocked(page))
1395                 BUG();
1396
1397         blocksize = inode->i_sb->s_blocksize;
1398         if (!page->buffers)
1399                 create_empty_buffers(page, inode, blocksize);
1400         head = page->buffers;
1401
1402         bbits = inode->i_sb->s_blocksize_bits;
1403         block = page->offset >> bbits;
1404         blocks = PAGE_SIZE >> bbits;
1405         start_block = offset >> bbits;
1406         end_block = (offset + bytes - 1) >> bbits;
1407         start_offset = offset & (blocksize - 1);
1408         start_bytes = blocksize - start_offset;
1409         if (start_bytes > bytes)
1410                 start_bytes = bytes;
1411         end_bytes = (offset+bytes) & (blocksize - 1);
1412         if (end_bytes > bytes)
1413                 end_bytes = bytes;
1414
1415         if (offset < 0 || offset >= PAGE_SIZE)
1416                 BUG();
1417         if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
1418                 BUG();
1419         if (start_block < 0 || start_block >= blocks)
1420                 BUG();
1421         if (end_block < 0 || end_block >= blocks)
1422                 BUG();
1423         // FIXME: currently we assume page alignment.
1424         if (page->offset & (PAGE_SIZE-1))
1425                 BUG();
1426
1427         i = 0;
1428         bh = head;
1429         partial = 0;
1430         do {
1431                 if (!bh)
1432                         BUG();
1433
1434                 if ((i < start_block) || (i > end_block)) {
1435                         if (!buffer_uptodate(bh))
1436                                 partial = 1;
1437                         goto skip;
1438                 }
1439
1440                 /*
1441                  * If the buffer is not up-to-date, we need to ask the low-level
1442                  * FS to do something for us (we used to have assumptions about
1443                  * the meaning of b_blocknr etc, that's bad).
1444                  *
1445                  * If "update" is set, that means that the low-level FS should
1446                  * try to make sure that the block is up-to-date because we're
1447                  * not going to fill it completely.
1448                  */
1449                 bh->b_end_io = end_buffer_io_sync;
1450                 if (!buffer_mapped(bh)) {
1451                         err = inode->i_op->get_block(inode, block, bh, 1);
1452                         if (err)
1453                                 goto out;
1454                 }
1455
1456                 if (!buffer_uptodate(bh) && (start_offset || (end_bytes && (i == end_block)))) {
1457                         if (buffer_new(bh)) {
1458                                 memset(bh->b_data, 0, bh->b_size);
1459                         } else {
1460                                 ll_rw_block(READ, 1, &bh);
1461                                 wait_on_buffer(bh);
1462                                 err = -EIO;
1463                                 if (!buffer_uptodate(bh))
1464                                         goto out;
1465                         }
1466                 }
1467
1468                 len = blocksize;
1469                 if (start_offset) {
1470                         len = start_bytes;
1471                         start_offset = 0;
1472                 } else if (end_bytes && (i == end_block)) {
1473                         len = end_bytes;
1474                         end_bytes = 0;
1475                 }
1476                 err = copy_from_user(target_buf, buf, len);
1477                 target_buf += len;
1478                 buf += len;
1479
1480                 /*
1481                  * we dirty buffers only after copying the data into
1482                  * the page - this way we can dirty the buffer even if
1483                  * the bh is still doing IO.
1484                  *
1485                  * NOTE! This also does a direct dirty balace check,
1486                  * rather than relying on bdflush just waking up every
1487                  * once in a while. This is to catch (and slow down)
1488                  * the processes that write tons of buffer..
1489                  *
1490                  * Note how we do NOT want to do this in the full block
1491                  * case: full pages are flushed not by the people who
1492                  * dirtied them, but by people who need memory. And we
1493                  * should not penalize them for somebody else writing
1494                  * lots of dirty pages.
1495                  */
1496                 set_bit(BH_Uptodate, &bh->b_state);
1497                 if (!test_and_set_bit(BH_Dirty, &bh->b_state)) {
1498                         __mark_dirty(bh, 0);
1499                         if (too_many_dirty_buffers)
1500                                 balance_dirty(bh->b_dev);
1501                 }
1502
1503                 if (err) {
1504                         err = -EFAULT;
1505                         goto out;
1506                 }
1507
1508 skip:
1509                 i++;
1510                 block++;
1511                 bh = bh->b_this_page;
1512         } while (bh != head);
1513
1514         /*
1515          * is this a partial write that happened to make all buffers
1516          * uptodate then we can optimize away a bogus readpage() for
1517          * the next read(). Here we 'discover' wether the page went
1518          * uptodate as a result of this (potentially partial) write.
1519          */
1520         if (!partial)
1521                 SetPageUptodate(page);
1522         return bytes;
1523 out:
1524         ClearPageUptodate(page);
1525         return err;
1526 }
1527
1528 /*
1529  * Start I/O on a page.
1530  * This function expects the page to be locked and may return
1531  * before I/O is complete. You then have to check page->locked,
1532  * page->uptodate, and maybe wait on page->wait.
1533  *
1534  * brw_page() is SMP-safe, although it's being called with the
1535  * kernel lock held - but the code is ready.
1536  *
1537  * FIXME: we need a swapper_inode->get_block function to remove
1538  *        some of the bmap kludges and interface ugliness here.
1539  */
1540 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
1541 {
1542         struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1543         int nr, fresh /* temporary debugging flag */, block;
1544
1545         if (!PageLocked(page))
1546                 panic("brw_page: page not locked for I/O");
1547 //      clear_bit(PG_error, &page->flags);
1548         /*
1549          * We pretty much rely on the page lock for this, because
1550          * create_page_buffers() might sleep.
1551          */
1552         fresh = 0;
1553         if (!page->buffers) {
1554                 create_page_buffers(rw, page, dev, b, size, bmap);
1555                 fresh = 1;
1556         }
1557         if (!page->buffers)
1558                 BUG();
1559         page->owner = -1;
1560
1561         head = page->buffers;
1562         bh = head;
1563         nr = 0;
1564         do {
1565                 block = *(b++);
1566
1567                 if (fresh && (atomic_read(&bh->b_count) != 0))
1568                         BUG();
1569                 if (rw == READ) {
1570                         if (!fresh)
1571                                 BUG();
1572                         if (bmap && !block) {
1573                                 if (block)
1574                                         BUG();
1575                         } else {
1576                                 if (bmap && !block)
1577                                         BUG();
1578                                 if (!buffer_uptodate(bh)) {
1579                                         arr[nr++] = bh;
1580                                         atomic_inc(&bh->b_count);
1581                                 }
1582                         }
1583                 } else { /* WRITE */
1584                         if (!bh->b_blocknr) {
1585                                 if (!block)
1586                                         BUG();
1587                                 bh->b_blocknr = block;
1588                         } else {
1589                                 if (!block)
1590                                         BUG();
1591                         }
1592                         set_bit(BH_Uptodate, &bh->b_state);
1593                         set_bit(BH_Dirty, &bh->b_state);
1594                         arr[nr++] = bh;
1595                         atomic_inc(&bh->b_count);
1596                 }
1597                 bh = bh->b_this_page;
1598         } while (bh != head);
1599         if (rw == READ)
1600                 ++current->mm->maj_flt;
1601         if ((rw == READ) && nr) {
1602                 if (Page_Uptodate(page))
1603                         BUG();
1604                 ll_rw_block(rw, nr, arr);
1605         } else {
1606                 if (!nr && rw == READ) {
1607                         SetPageUptodate(page);
1608                         page->owner = (int)current;
1609                         UnlockPage(page);
1610                 }
1611                 if (nr && (rw == WRITE))
1612                         ll_rw_block(rw, nr, arr);
1613         }
1614         return 0;
1615 }
1616
1617 /*
1618  * Generic "read page" function for block devices that have the normal
1619  * bmap functionality. This is most of the block device filesystems.
1620  * Reads the page asynchronously --- the unlock_buffer() and
1621  * mark_buffer_uptodate() functions propagate buffer state into the
1622  * page struct once IO has completed.
1623  */
1624 int block_read_full_page(struct file * file, struct page * page)
1625 {
1626         struct dentry *dentry = file->f_dentry;
1627         struct inode *inode = dentry->d_inode;
1628         unsigned long iblock;
1629         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1630         unsigned int blocksize, blocks;
1631         int nr;
1632
1633         if (!PageLocked(page))
1634                 PAGE_BUG(page);
1635         blocksize = inode->i_sb->s_blocksize;
1636         if (!page->buffers)
1637                 create_empty_buffers(page, inode, blocksize);
1638         head = page->buffers;
1639
1640         blocks = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
1641         iblock = page->offset >> inode->i_sb->s_blocksize_bits;
1642         page->owner = -1;
1643         head = page->buffers;
1644         bh = head;
1645         nr = 0;
1646
1647         do {
1648                 if (buffer_uptodate(bh))
1649                         continue;
1650
1651                 if (!buffer_mapped(bh)) {
1652                         inode->i_op->get_block(inode, iblock, bh, 0);
1653                         if (!buffer_mapped(bh)) {
1654                                 memset(bh->b_data, 0, blocksize);
1655                                 set_bit(BH_Uptodate, &bh->b_state);
1656                                 continue;
1657                         }
1658                 }
1659
1660                 init_buffer(bh, end_buffer_io_async, NULL);
1661                 atomic_inc(&bh->b_count);
1662                 arr[nr] = bh;
1663                 nr++;
1664         } while (iblock++, (bh = bh->b_this_page) != head);
1665
1666         ++current->mm->maj_flt;
1667         if (nr) {
1668                 if (Page_Uptodate(page))
1669                         BUG();
1670                 ll_rw_block(READ, nr, arr);
1671         } else {
1672                 /*
1673                  * all buffers are uptodate - we can set the page
1674                  * uptodate as well.
1675                  */
1676                 SetPageUptodate(page);
1677                 page->owner = (int)current;
1678                 UnlockPage(page);
1679         }
1680         return 0;
1681 }
1682
1683 /*
1684  * Try to increase the number of buffers available: the size argument
1685  * is used to determine what kind of buffers we want.
1686  */
1687 static int grow_buffers(int size)
1688 {
1689         unsigned long page;
1690         struct buffer_head *bh, *tmp;
1691         struct buffer_head * insert_point;
1692         int isize;
1693
1694         if ((size & 511) || (size > PAGE_SIZE)) {
1695                 printk("VFS: grow_buffers: size = %d\n",size);
1696                 return 0;
1697         }
1698
1699         if (!(page = __get_free_page(GFP_BUFFER)))
1700                 return 0;
1701         bh = create_buffers(page, size, 0);
1702         if (!bh) {
1703                 free_page(page);
1704                 return 0;
1705         }
1706
1707         isize = BUFSIZE_INDEX(size);
1708
1709         spin_lock(&free_list[isize].lock);
1710         insert_point = free_list[isize].list;
1711         tmp = bh;
1712         while (1) {
1713                 if (insert_point) {
1714                         tmp->b_next_free = insert_point->b_next_free;
1715                         tmp->b_prev_free = insert_point;
1716                         insert_point->b_next_free->b_prev_free = tmp;
1717                         insert_point->b_next_free = tmp;
1718                 } else {
1719                         tmp->b_prev_free = tmp;
1720                         tmp->b_next_free = tmp;
1721                 }
1722                 insert_point = tmp;
1723                 if (tmp->b_this_page)
1724                         tmp = tmp->b_this_page;
1725                 else
1726                         break;
1727         }
1728         tmp->b_this_page = bh;
1729         free_list[isize].list = bh;
1730         spin_unlock(&free_list[isize].lock);
1731
1732         mem_map[MAP_NR(page)].buffers = bh;
1733         atomic_add(PAGE_SIZE, &buffermem);
1734         return 1;
1735 }
1736
1737 /*
1738  * Can the buffer be thrown out?
1739  */
1740 #define BUFFER_BUSY_BITS        ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
1741 #define buffer_busy(bh)         (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
1742
1743 /*
1744  * try_to_free_buffers() checks if all the buffers on this particular page
1745  * are unused, and free's the page if so.
1746  *
1747  * Wake up bdflush() if this fails - if we're running low on memory due
1748  * to dirty buffers, we need to flush them out as quickly as possible.
1749  *
1750  * NOTE: There are quite a number of ways that threads of control can
1751  *       obtain a reference to a buffer head within a page.  So we must
1752  *       lock out all of these paths to cleanly toss the page.
1753  */
1754 int try_to_free_buffers(struct page * page)
1755 {
1756         struct buffer_head * tmp, * bh = page->buffers;
1757         int index = BUFSIZE_INDEX(bh->b_size);
1758         int ret;
1759
1760         spin_lock(&lru_list_lock);
1761         write_lock(&hash_table_lock);
1762         spin_lock(&free_list[index].lock);
1763         tmp = bh;
1764         do {
1765                 struct buffer_head * p = tmp;
1766
1767                 tmp = tmp->b_this_page;
1768                 if (buffer_busy(p))
1769                         goto busy_buffer_page;
1770         } while (tmp != bh);
1771
1772         spin_lock(&unused_list_lock);
1773         tmp = bh;
1774         do {
1775                 struct buffer_head * p = tmp;
1776                 tmp = tmp->b_this_page;
1777
1778                 /* The buffer can be either on the regular
1779                  * queues or on the free list..
1780                  */
1781                 if (p->b_dev == B_FREE) {
1782                         __remove_from_free_list(p, index);
1783                 } else {
1784                         if (p->b_pprev)
1785                                 __hash_unlink(p);
1786                         __remove_from_lru_list(p, p->b_list);
1787                 }
1788                 __put_unused_buffer_head(p);
1789         } while (tmp != bh);
1790         spin_unlock(&unused_list_lock);
1791
1792         /* Wake up anyone waiting for buffer heads */
1793         wake_up(&buffer_wait);
1794
1795         /* And free the page */
1796         page->buffers = NULL;
1797         __free_page(page);
1798         ret = 1;
1799 out:
1800         spin_unlock(&free_list[index].lock);
1801         write_unlock(&hash_table_lock);
1802         spin_unlock(&lru_list_lock);
1803         return ret;
1804
1805 busy_buffer_page:
1806         /* Uhhuh, start writeback so that we don't end up with all dirty pages */
1807         too_many_dirty_buffers = 1;
1808         wakeup_bdflush(0);
1809         ret = 0;
1810         goto out;
1811 }
1812
1813 /* ===================== Init ======================= */
1814
1815 /*
1816  * allocate the hash table and init the free list
1817  * Use gfp() for the hash table to decrease TLB misses, use
1818  * SLAB cache for buffer heads.
1819  */
1820 void __init buffer_init(unsigned long memory_size)
1821 {
1822         int order, i;
1823         unsigned int nr_hash;
1824
1825         /* The buffer cache hash table is less important these days,
1826          * trim it a bit.
1827          */
1828         memory_size >>= 14;
1829         memory_size *= sizeof(struct buffer_head *);
1830         for (order = 0; (PAGE_SIZE << order) < memory_size; order++)
1831                 ;
1832
1833         /* try to allocate something until we get it or we're asking
1834            for something that is really too small */
1835
1836         do {
1837                 unsigned long tmp;
1838
1839                 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
1840                 bh_hash_mask = (nr_hash - 1);
1841
1842                 tmp = nr_hash;
1843                 bh_hash_shift = 0;
1844                 while((tmp >>= 1UL) != 0UL)
1845                         bh_hash_shift++;
1846
1847                 hash_table = (struct buffer_head **)
1848                     __get_free_pages(GFP_ATOMIC, order);
1849         } while (hash_table == NULL && --order > 0);
1850         printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
1851                nr_hash, order, (1UL<<order) * PAGE_SIZE);
1852
1853         if (!hash_table)
1854                 panic("Failed to allocate buffer hash table\n");
1855
1856         /* Setup hash chains. */
1857         for(i = 0; i < nr_hash; i++)
1858                 hash_table[i] = NULL;
1859
1860         /* Setup free lists. */
1861         for(i = 0; i < NR_SIZES; i++) {
1862                 free_list[i].list = NULL;
1863                 free_list[i].lock = SPIN_LOCK_UNLOCKED;
1864         }
1865
1866         /* Setup lru lists. */
1867         for(i = 0; i < NR_LIST; i++)
1868                 lru_list[i] = NULL;
1869
1870         bh_cachep = kmem_cache_create("buffer_head",
1871                                       sizeof(struct buffer_head),
1872                                       0,
1873                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
1874         if(!bh_cachep)
1875                 panic("Cannot create buffer head SLAB cache\n");
1876 }
1877
1878
1879 /* ====================== bdflush support =================== */
1880
1881 /* This is a simple kernel daemon, whose job it is to provide a dynamic
1882  * response to dirty buffers.  Once this process is activated, we write back
1883  * a limited number of buffers to the disks and then go back to sleep again.
1884  */
1885 static DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
1886 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
1887 struct task_struct *bdflush_tsk = 0;
1888
1889 void wakeup_bdflush(int wait)
1890 {
1891         if (current == bdflush_tsk)
1892                 return;
1893         if (wait)
1894                 run_task_queue(&tq_disk);
1895         wake_up(&bdflush_wait);
1896         if (wait)
1897                 sleep_on(&bdflush_done);
1898 }
1899
1900
1901 /*
1902  * Here we attempt to write back old buffers.  We also try to flush inodes
1903  * and supers as well, since this function is essentially "update", and
1904  * otherwise there would be no way of ensuring that these quantities ever
1905  * get written back.  Ideally, we would have a timestamp on the inodes
1906  * and superblocks so that we could write back only the old ones as well
1907  */
1908
1909 static int sync_old_buffers(void)
1910 {
1911         int nlist;
1912
1913         lock_kernel();
1914         sync_supers(0);
1915         sync_inodes(0);
1916         unlock_kernel();
1917
1918         for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
1919                 struct buffer_head *bh;
1920         repeat:
1921                 spin_lock(&lru_list_lock);
1922                 bh = lru_list[nlist];
1923                 if(bh) {
1924                         struct buffer_head *next;
1925                         int i;
1926                         for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) {
1927                                 next = bh->b_next_free;
1928
1929                                 /* If the buffer is not on the proper list,
1930                                  * then refile it.
1931                                  */
1932                                 if ((nlist == BUF_DIRTY &&
1933                                      (!buffer_dirty(bh) && !buffer_locked(bh))) ||
1934                                     (nlist == BUF_LOCKED && !buffer_locked(bh))) {
1935                                         __refile_buffer(bh);
1936                                         continue;
1937                                 }
1938
1939                                 if (buffer_locked(bh) || !buffer_dirty(bh))
1940                                         continue;
1941
1942                                 /* OK, now we are committed to write it out. */
1943                                 bh->b_flushtime = 0;
1944                                 atomic_inc(&bh->b_count);
1945                                 spin_unlock(&lru_list_lock);
1946                                 ll_rw_block(WRITE, 1, &bh);
1947                                 atomic_dec(&bh->b_count);
1948                                 goto repeat;
1949                         }
1950                 }
1951                 spin_unlock(&lru_list_lock);
1952         }
1953         run_task_queue(&tq_disk);
1954         return 0;
1955 }
1956
1957
1958 /* This is the interface to bdflush.  As we get more sophisticated, we can
1959  * pass tuning parameters to this "process", to adjust how it behaves.
1960  * We would want to verify each parameter, however, to make sure that it
1961  * is reasonable. */
1962
1963 asmlinkage int sys_bdflush(int func, long data)
1964 {
1965         int i, error = -EPERM;
1966
1967         if (!capable(CAP_SYS_ADMIN))
1968                 goto out;
1969
1970         if (func == 1) {
1971                  error = sync_old_buffers();
1972                  goto out;
1973         }
1974
1975         /* Basically func 1 means read param 1, 2 means write param 1, etc */
1976         if (func >= 2) {
1977                 i = (func-2) >> 1;
1978                 error = -EINVAL;
1979                 if (i < 0 || i >= N_PARAM)
1980                         goto out;
1981                 if((func & 1) == 0) {
1982                         error = put_user(bdf_prm.data[i], (int*)data);
1983                         goto out;
1984                 }
1985                 if (data < bdflush_min[i] || data > bdflush_max[i])
1986                         goto out;
1987                 bdf_prm.data[i] = data;
1988                 error = 0;
1989                 goto out;
1990         };
1991
1992         /* Having func 0 used to launch the actual bdflush and then never
1993          * return (unless explicitly killed). We return zero here to
1994          * remain semi-compatible with present update(8) programs.
1995          */
1996         error = 0;
1997 out:
1998         return error;
1999 }
2000
2001 /*
2002  * This is the actual bdflush daemon itself. It used to be started from
2003  * the syscall above, but now we launch it ourselves internally with
2004  * kernel_thread(...)  directly after the first thread in init/main.c
2005  */
2006 int bdflush(void * unused)
2007 {
2008         /*
2009          *      We have a bare-bones task_struct, and really should fill
2010          *      in a few more things so "top" and /proc/2/{exe,root,cwd}
2011          *      display semi-sane things. Not real crucial though...
2012          */
2013
2014         current->session = 1;
2015         current->pgrp = 1;
2016         sprintf(current->comm, "kflushd");
2017         bdflush_tsk = current;
2018
2019         for (;;) {
2020                 int nlist;
2021
2022                 CHECK_EMERGENCY_SYNC
2023
2024                 for(nlist = BUF_LOCKED; nlist <= BUF_DIRTY; nlist++) {
2025                         int nr, major, written = 0;
2026                         struct buffer_head *next;
2027
2028                 repeat:
2029                         spin_lock(&lru_list_lock);
2030                         next = lru_list[nlist];
2031                         nr = nr_buffers_type[nlist];
2032                         while (nr-- > 0) {
2033                                 struct buffer_head *bh = next;
2034
2035                                 next = next->b_next_free;
2036
2037                                 /* If the buffer is not on the correct list,
2038                                  * then refile it.
2039                                  */
2040                                 if ((nlist == BUF_DIRTY &&
2041                                      (!buffer_dirty(bh) && !buffer_locked(bh))) ||
2042                                     (nlist == BUF_LOCKED && !buffer_locked(bh))) {
2043                                         __refile_buffer(bh);
2044                                         continue;
2045                                 }
2046
2047                                 /* If we aren't in panic mode, don't write out too much
2048                                  * at a time. Also, don't write out buffers we don't
2049                                  * really have to write out yet..
2050                                  */
2051                                 if (!too_many_dirty_buffers) {
2052                                         if (written > bdf_prm.b_un.ndirty)
2053                                                 break;
2054                                         if (time_before(jiffies, bh->b_flushtime))
2055                                                 continue;
2056                                 }
2057
2058                                 if (buffer_locked(bh) || !buffer_dirty(bh))
2059                                          continue;
2060
2061                                 major = MAJOR(bh->b_dev);
2062                                 written++;
2063                                 bh->b_flushtime = 0;
2064
2065                                 /*
2066                                  * For the loop major we can try to do asynchronous writes,
2067                                  * but we have to guarantee that we're making some progress..
2068                                  */
2069                                 atomic_inc(&bh->b_count);
2070                                 spin_unlock(&lru_list_lock);
2071                                 if (major == LOOP_MAJOR && written > 1) {
2072                                         ll_rw_block(WRITEA, 1, &bh);
2073                                         if (buffer_dirty(bh))
2074                                                 --written;
2075                                 } else
2076                                         ll_rw_block(WRITE, 1, &bh);
2077                                 atomic_dec(&bh->b_count);
2078                                 goto repeat;
2079                         }
2080                         spin_unlock(&lru_list_lock);
2081                 }
2082                 run_task_queue(&tq_disk);
2083                 wake_up(&bdflush_done);
2084
2085                 /*
2086                  * If there are still a lot of dirty buffers around,
2087                  * skip the sleep and flush some more. Otherwise, we
2088                  * sleep for a while and mark us as not being in panic
2089                  * mode..
2090                  */
2091                 if (!too_many_dirty_buffers || nr_buffers_type[BUF_DIRTY] < bdf_prm.b_un.ndirty) {
2092                         too_many_dirty_buffers = 0;
2093                         spin_lock_irq(&current->sigmask_lock);
2094                         flush_signals(current);
2095                         spin_unlock_irq(&current->sigmask_lock);
2096                         interruptible_sleep_on_timeout(&bdflush_wait, 5*HZ);
2097                 }
2098         }
2099 }