1 /*
2 * linux/fs/buffer.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
23 /* Added 32k buffer block sizes - these are required for older ARM systems.
24 * - RMK
27 /* Thread it... -DaveM */
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/malloc.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/smp_lock.h>
39 #include <linux/vmalloc.h>
40 #include <linux/blkdev.h>
41 #include <linux/sysrq.h>
42 #include <linux/file.h>
43 #include <linux/init.h>
44 #include <linux/quotaops.h>
45 #include <linux/iobuf.h>
46 #include <linux/highmem.h>
48 #include <asm/uaccess.h>
49 #include <asm/io.h>
50 #include <asm/bitops.h>
51 #include <asm/mmu_context.h>
53 #define NR_SIZES 7
54 static char buffersize_index[65] =
55 {-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
56 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
57 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
58 -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1,
59 6};
61 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
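/*
 * buffersize_index[] maps a block size (expressed in 512-byte units) onto
 * the index of its size class, so for the seven supported sizes:
 * BUFSIZE_INDEX(512) == 0, BUFSIZE_INDEX(1024) == 1, BUFSIZE_INDEX(4096) == 3,
 * ..., BUFSIZE_INDEX(32768) == 6.  Unsupported sizes hit a -1 entry.
 */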
62 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
63 #define NR_RESERVED (2*MAX_BUF_PER_PAGE)
64 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
65 number of unused buffer heads */
67 /* Anti-deadlock ordering:
68 * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock
72 * Hash table gook..
74 static unsigned int bh_hash_mask;
75 static unsigned int bh_hash_shift;
76 static struct buffer_head **hash_table;
77 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
79 static struct buffer_head *lru_list[NR_LIST];
80 static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED;
81 static int nr_buffers_type[NR_LIST];
82 static unsigned long size_buffers_type[NR_LIST];
84 static struct buffer_head * unused_list;
85 static int nr_unused_buffer_heads;
86 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
87 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
89 struct bh_free_head {
90 struct buffer_head *list;
91 spinlock_t lock;
93 static struct bh_free_head free_list[NR_SIZES];
95 kmem_cache_t *bh_cachep;
97 static int grow_buffers(int size);
98 static void __refile_buffer(struct buffer_head *);
100 /* This is used by some architectures to estimate available memory. */
101 atomic_t buffermem_pages = ATOMIC_INIT(0);
103 /* Here is the parameter block for the bdflush process. If you add or
104 * remove any of the parameters, make sure to update kernel/sysctl.c.
107 #define N_PARAM 9
109 /* The dummy values in this structure are left in there for compatibility
110 * with old programs that play with the /proc entries.
112 union bdflush_param {
113 struct {
114 int nfract; /* Percentage of buffer cache dirty to
115 activate bdflush */
116 int ndirty; /* Maximum number of dirty blocks to write out per
117 wake-cycle */
118 int nrefill; /* Number of clean buffers to try to obtain
119 each time we call refill */
120 int nref_dirt; /* Dirty buffer threshold for activating bdflush
121 when trying to refill buffers. */
122 int interval; /* jiffies delay between kupdate flushes */
123 int age_buffer; /* Time for normal buffer to age before we flush it */
124 int age_super; /* Time for superblock to age before we flush it */
125 int dummy2; /* unused */
126 int dummy3; /* unused */
127 } b_un;
128 unsigned int data[N_PARAM];
129 } bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
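/*
 * With the initialiser above the defaults are: nfract = 40 (%), ndirty = 500,
 * nrefill = 64, nref_dirt = 256, interval = 5*HZ, age_buffer = 30*HZ,
 * age_super = 5*HZ, plus the two unused dummy slots (1884, 2) kept only for
 * /proc compatibility.
 */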
131 /* These are the min and max parameter values that we will allow to be assigned */
132 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
133 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,600*HZ, 6000*HZ, 6000*HZ, 2047, 5};
136 * Rewrote the wait-routines to use the "new" wait-queue functionality,
137 * and got rid of the cli-sti pairs. The wait-queue routines still
138 * need cli-sti, but now it's just a couple of 386 instructions or so.
140 * Note that the real wait_on_buffer() is an inline function that checks
141 * if 'b_wait' is set before calling this, so that the queues aren't set
142 * up unnecessarily.
144 void __wait_on_buffer(struct buffer_head * bh)
146 struct task_struct *tsk = current;
147 DECLARE_WAITQUEUE(wait, tsk);
149 atomic_inc(&bh->b_count);
150 add_wait_queue(&bh->b_wait, &wait);
151 do {
152 run_task_queue(&tq_disk);
153 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
154 if (!buffer_locked(bh))
155 break;
156 schedule();
157 } while (buffer_locked(bh));
158 tsk->state = TASK_RUNNING;
159 remove_wait_queue(&bh->b_wait, &wait);
160 atomic_dec(&bh->b_count);
163 /* Call sync_buffers with wait!=0 to ensure that the call does not
164 * return until all buffer writes have completed. Sync() may return
165 * before the writes have finished; fsync() may not.
168 /* Godamity-damn. Some buffers (bitmaps for filesystems)
169 * spontaneously dirty themselves without ever brelse being called.
170 * We will ultimately want to put these in a separate list, but for
171 * now we search all of the lists for dirty buffers.
173 static int sync_buffers(kdev_t dev, int wait)
175 int i, retry, pass = 0, err = 0;
176 struct buffer_head * bh, *next;
178 /* One pass for no-wait, three for wait:
179 * 0) write out all dirty, unlocked buffers;
180 * 1) write out all dirty buffers, waiting if locked;
181 * 2) wait for completion by waiting for all buffers to unlock.
183 do {
184 retry = 0;
186 /* We search all lists as a failsafe mechanism, not because we expect
187 * there to be dirty buffers on any of the other lists.
189 repeat:
190 spin_lock(&lru_list_lock);
191 bh = lru_list[BUF_DIRTY];
192 if (!bh)
193 goto repeat2;
195 for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
196 next = bh->b_next_free;
198 if (!lru_list[BUF_DIRTY])
199 break;
200 if (dev && bh->b_dev != dev)
201 continue;
202 if (buffer_locked(bh)) {
203 /* Buffer is locked; skip it unless wait is
204 * requested AND pass > 0.
206 if (!wait || !pass) {
207 retry = 1;
208 continue;
210 atomic_inc(&bh->b_count);
211 spin_unlock(&lru_list_lock);
212 wait_on_buffer (bh);
213 atomic_dec(&bh->b_count);
214 goto repeat;
217 /* If an unlocked buffer is not uptodate, there has
218 * been an IO error. Skip it.
220 if (wait && buffer_req(bh) && !buffer_locked(bh) &&
221 !buffer_dirty(bh) && !buffer_uptodate(bh)) {
222 err = -EIO;
223 continue;
226 /* Don't write clean buffers. Don't write ANY buffers
227 * on the third pass.
229 if (!buffer_dirty(bh) || pass >= 2)
230 continue;
232 atomic_inc(&bh->b_count);
233 spin_unlock(&lru_list_lock);
234 ll_rw_block(WRITE, 1, &bh);
235 atomic_dec(&bh->b_count);
236 retry = 1;
237 goto repeat;
240 repeat2:
241 bh = lru_list[BUF_LOCKED];
242 if (!bh) {
243 spin_unlock(&lru_list_lock);
244 break;
246 for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
247 next = bh->b_next_free;
249 if (!lru_list[BUF_LOCKED])
250 break;
251 if (dev && bh->b_dev != dev)
252 continue;
253 if (buffer_locked(bh)) {
254 /* Buffer is locked; skip it unless wait is
255 * requested AND pass > 0.
257 if (!wait || !pass) {
258 retry = 1;
259 continue;
261 atomic_inc(&bh->b_count);
262 spin_unlock(&lru_list_lock);
263 wait_on_buffer (bh);
264 spin_lock(&lru_list_lock);
265 atomic_dec(&bh->b_count);
266 goto repeat2;
269 spin_unlock(&lru_list_lock);
271 /* If we are waiting for the sync to succeed, and if any dirty
272 * blocks were written, then repeat; on the second pass, only
273 * wait for buffers being written (do not pass to write any
274 * more buffers on the second pass).
276 } while (wait && retry && ++pass<=2);
277 return err;
280 void sync_dev(kdev_t dev)
282 sync_supers(dev);
283 sync_inodes(dev);
284 DQUOT_SYNC(dev);
285 /* sync all the dirty buffers out to disk only _after_ all the
286 high level layers have finished generating dirty buffer data
287 (or we'll return with some buffer still dirty on the blockdevice
288 so breaking the semantics of this call) */
289 sync_buffers(dev, 0);
291 * FIXME(eric) we need to sync the physical devices here.
292 * This is because some (scsi) controllers have huge amounts of
293 * cache onboard (hundreds of Mb), and we need to instruct
294 * them to commit all of the dirty memory to disk, and we should
295 * not return until this has happened.
297 * This would need to get implemented by going through the assorted
298 * layers so that each block major number can be synced, and this
299 * would call down into the upper and mid-layer scsi.
303 int fsync_dev(kdev_t dev)
305 sync_buffers(dev, 0);
307 lock_kernel();
308 sync_supers(dev);
309 sync_inodes(dev);
310 DQUOT_SYNC(dev);
311 unlock_kernel();
313 return sync_buffers(dev, 1);
316 asmlinkage long sys_sync(void)
318 fsync_dev(0);
319 return 0;
323 * filp may be NULL if called via the msync of a vma.
326 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
328 struct inode * inode = dentry->d_inode;
329 struct super_block * sb;
330 kdev_t dev;
331 int ret;
333 lock_kernel();
334 /* sync the inode to buffers */
335 write_inode_now(inode, 0);
337 /* sync the superblock to buffers */
338 sb = inode->i_sb;
339 wait_on_super(sb);
340 if (sb->s_op && sb->s_op->write_super)
341 sb->s_op->write_super(sb);
343 /* .. finally sync the buffers to disk */
344 dev = inode->i_dev;
345 ret = sync_buffers(dev, 1);
346 unlock_kernel();
347 return ret;
350 asmlinkage long sys_fsync(unsigned int fd)
352 struct file * file;
353 struct dentry * dentry;
354 struct inode * inode;
355 int err;
357 err = -EBADF;
358 file = fget(fd);
359 if (!file)
360 goto out;
362 dentry = file->f_dentry;
363 inode = dentry->d_inode;
365 err = -EINVAL;
366 if (!file->f_op || !file->f_op->fsync)
367 goto out_putf;
369 /* We need to protect against concurrent writers.. */
370 down(&inode->i_sem);
371 err = file->f_op->fsync(file, dentry, 0);
372 up(&inode->i_sem);
374 out_putf:
375 fput(file);
376 out:
377 return err;
380 asmlinkage long sys_fdatasync(unsigned int fd)
382 struct file * file;
383 struct dentry * dentry;
384 struct inode * inode;
385 int err;
387 err = -EBADF;
388 file = fget(fd);
389 if (!file)
390 goto out;
392 dentry = file->f_dentry;
393 inode = dentry->d_inode;
395 err = -EINVAL;
396 if (!file->f_op || !file->f_op->fsync)
397 goto out_putf;
399 down(&inode->i_sem);
400 err = file->f_op->fsync(file, dentry, 1);
401 up(&inode->i_sem);
403 out_putf:
404 fput(file);
405 out:
406 return err;
409 /* After several hours of tedious analysis, the following hash
410 * function won. Do not mess with it... -DaveM
412 #define _hashfn(dev,block) \
413 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
414 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ ((block) << (bh_hash_shift - 12))))
415 #define hash(dev,block) hash_table[(_hashfn(dev,block) & bh_hash_mask)]
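/*
 * Buffers are looked up by (dev, block): _hashfn() folds both values under
 * bh_hash_mask and hash() yields the matching chain head in hash_table[].
 * Chains are singly linked through b_next with a back pointer in b_pprev,
 * protected by hash_table_lock (read side for lookups, write side for
 * link/unlink).
 */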
417 static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head)
419 if ((bh->b_next = *head) != NULL)
420 bh->b_next->b_pprev = &bh->b_next;
421 *head = bh;
422 bh->b_pprev = head;
425 static __inline__ void __hash_unlink(struct buffer_head *bh)
427 if (bh->b_pprev) {
428 if (bh->b_next)
429 bh->b_next->b_pprev = bh->b_pprev;
430 *(bh->b_pprev) = bh->b_next;
431 bh->b_pprev = NULL;
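/*
 * The lru_list[] and free_list[] rings below are circular, doubly linked
 * lists threaded through b_next_free/b_prev_free.  lru_list_lock protects
 * the LRU rings and their counters, while each free_list[isize] has its own
 * spinlock, following the lock ordering documented above.
 */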
435 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
437 struct buffer_head **bhp = &lru_list[blist];
439 if(!*bhp) {
440 *bhp = bh;
441 bh->b_prev_free = bh;
443 bh->b_next_free = *bhp;
444 bh->b_prev_free = (*bhp)->b_prev_free;
445 (*bhp)->b_prev_free->b_next_free = bh;
446 (*bhp)->b_prev_free = bh;
447 nr_buffers_type[blist]++;
448 size_buffers_type[blist] += bh->b_size;
451 static void __remove_from_lru_list(struct buffer_head * bh, int blist)
453 if (bh->b_prev_free || bh->b_next_free) {
454 bh->b_prev_free->b_next_free = bh->b_next_free;
455 bh->b_next_free->b_prev_free = bh->b_prev_free;
456 if (lru_list[blist] == bh)
457 lru_list[blist] = bh->b_next_free;
458 if (lru_list[blist] == bh)
459 lru_list[blist] = NULL;
460 bh->b_next_free = bh->b_prev_free = NULL;
461 nr_buffers_type[blist]--;
462 size_buffers_type[blist] -= bh->b_size;
466 static void __remove_from_free_list(struct buffer_head * bh, int index)
468 if(bh->b_next_free == bh)
469 free_list[index].list = NULL;
470 else {
471 bh->b_prev_free->b_next_free = bh->b_next_free;
472 bh->b_next_free->b_prev_free = bh->b_prev_free;
473 if (free_list[index].list == bh)
474 free_list[index].list = bh->b_next_free;
476 bh->b_next_free = bh->b_prev_free = NULL;
479 /* must be called with both the hash_table_lock and the lru_list_lock
480 held */
481 static void __remove_from_queues(struct buffer_head *bh)
483 __hash_unlink(bh);
484 __remove_from_lru_list(bh, bh->b_list);
487 static void insert_into_queues(struct buffer_head *bh)
489 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
491 spin_lock(&lru_list_lock);
492 write_lock(&hash_table_lock);
493 __hash_link(bh, head);
494 __insert_into_lru_list(bh, bh->b_list);
495 write_unlock(&hash_table_lock);
496 spin_unlock(&lru_list_lock);
499 /* This function must only run if there are no other
500 * references _anywhere_ to this buffer head.
502 static void put_last_free(struct buffer_head * bh)
504 struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
505 struct buffer_head **bhp = &head->list;
507 bh->b_state = 0;
509 spin_lock(&head->lock);
510 bh->b_dev = B_FREE;
511 if(!*bhp) {
512 *bhp = bh;
513 bh->b_prev_free = bh;
515 bh->b_next_free = *bhp;
516 bh->b_prev_free = (*bhp)->b_prev_free;
517 (*bhp)->b_prev_free->b_next_free = bh;
518 (*bhp)->b_prev_free = bh;
519 spin_unlock(&head->lock);
523 * Why like this, I hear you say... The reason is race-conditions.
524 * As we don't lock buffers (unless we are reading them, that is),
525 * something might happen to them while we sleep (i.e. a read error
526 * will force them bad). This shouldn't really happen currently, but
527 * the code is ready.
529 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
531 struct buffer_head **head = &hash(dev, block);
532 struct buffer_head *bh;
534 read_lock(&hash_table_lock);
535 for(bh = *head; bh; bh = bh->b_next)
536 if (bh->b_blocknr == block &&
537 bh->b_size == size &&
538 bh->b_dev == dev)
539 break;
540 if (bh)
541 atomic_inc(&bh->b_count);
542 read_unlock(&hash_table_lock);
544 return bh;
547 unsigned int get_hardblocksize(kdev_t dev)
550 * Get the hard sector size for the given device. If we don't know
551 * what it is, return 0.
553 if (hardsect_size[MAJOR(dev)] != NULL) {
554 int blksize = hardsect_size[MAJOR(dev)][MINOR(dev)];
555 if (blksize != 0)
556 return blksize;
560 * We don't know what the hardware sector size for this device is.
561 * Return 0 indicating that we don't know.
563 return 0;
566 /* If invalidate_buffers() will trash dirty buffers, it means some kind
567 of fs corruption is going on. Trashing dirty data always implies losing
568 information that was supposed to be just stored on the physical layer
569 by the user.
571 Thus invalidate_buffers in general usage is not allowed to trash dirty
572 buffers. For example ioctl(BLKFLSBUF) expects dirty data to be preserved.
574 NOTE: in the case where the user removed a removable-media disk, even if
575 there's still dirty data not synced on disk (due to a bug in the device driver
576 or to an error of the user), by not destroying the dirty buffers we could
577 generate corruption also on the next media inserted; thus a parameter is
578 necessary to handle this case in the safest way possible (trying
579 not to corrupt the newly inserted disk with the data belonging to
580 the old, now corrupted, disk). Also for the ramdisk the natural thing
581 to do in order to release the ramdisk memory is to destroy dirty buffers.
583 These are two special cases. Normal usage implies that the device driver
584 issues a sync on the device (without waiting for I/O completion) and
585 then calls invalidate_buffers() in a way that doesn't trash dirty buffers. */
586 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
588 int i, nlist, slept;
589 struct buffer_head * bh, * bh_next;
591 retry:
592 slept = 0;
593 spin_lock(&lru_list_lock);
594 for(nlist = 0; nlist < NR_LIST; nlist++) {
595 bh = lru_list[nlist];
596 if (!bh)
597 continue;
598 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
599 bh_next = bh->b_next_free;
600 if (bh->b_dev != dev)
601 continue;
602 if (buffer_locked(bh)) {
603 atomic_inc(&bh->b_count);
604 spin_unlock(&lru_list_lock);
605 wait_on_buffer(bh);
606 slept = 1;
607 spin_lock(&lru_list_lock);
608 atomic_dec(&bh->b_count);
611 write_lock(&hash_table_lock);
612 if (!atomic_read(&bh->b_count) &&
613 (destroy_dirty_buffers || !buffer_dirty(bh))) {
614 __remove_from_queues(bh);
615 put_last_free(bh);
617 write_unlock(&hash_table_lock);
618 if (slept)
619 goto out;
622 out:
623 spin_unlock(&lru_list_lock);
624 if (slept)
625 goto retry;
628 void set_blocksize(kdev_t dev, int size)
630 extern int *blksize_size[];
631 int i, nlist, slept;
632 struct buffer_head * bh, * bh_next;
634 if (!blksize_size[MAJOR(dev)])
635 return;
637 /* Size must be a power of two, and between 512 and PAGE_SIZE */
638 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
639 panic("Invalid blocksize passed to set_blocksize");
641 if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) {
642 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
643 return;
645 if (blksize_size[MAJOR(dev)][MINOR(dev)] == size)
646 return;
647 sync_buffers(dev, 2);
648 blksize_size[MAJOR(dev)][MINOR(dev)] = size;
650 retry:
651 slept = 0;
652 spin_lock(&lru_list_lock);
653 for(nlist = 0; nlist < NR_LIST; nlist++) {
654 bh = lru_list[nlist];
655 if (!bh)
656 continue;
657 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
658 bh_next = bh->b_next_free;
659 if (bh->b_dev != dev || bh->b_size == size)
660 continue;
661 if (buffer_locked(bh)) {
662 atomic_inc(&bh->b_count);
663 spin_unlock(&lru_list_lock);
664 wait_on_buffer(bh);
665 slept = 1;
666 spin_lock(&lru_list_lock);
667 atomic_dec(&bh->b_count);
670 write_lock(&hash_table_lock);
671 if (!atomic_read(&bh->b_count)) {
672 if (buffer_dirty(bh))
673 printk(KERN_WARNING
674 "set_blocksize: dev %s buffer_dirty %lu size %hu\n",
675 kdevname(dev), bh->b_blocknr, bh->b_size);
676 __remove_from_queues(bh);
677 put_last_free(bh);
678 } else {
679 if (atomic_set_buffer_clean(bh))
680 __refile_buffer(bh);
681 clear_bit(BH_Uptodate, &bh->b_state);
682 printk(KERN_WARNING
683 "set_blocksize: "
684 "b_count %d, dev %s, block %lu, from %p\n",
685 atomic_read(&bh->b_count), bdevname(bh->b_dev),
686 bh->b_blocknr, __builtin_return_address(0));
688 write_unlock(&hash_table_lock);
689 if (slept)
690 goto out;
693 out:
694 spin_unlock(&lru_list_lock);
695 if (slept)
696 goto retry;
700 * We used to try various strange things. Let's not.
702 static void refill_freelist(int size)
704 if (!grow_buffers(size)) {
705 wakeup_bdflush(1);
706 current->policy |= SCHED_YIELD;
707 schedule();
711 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *dev_id)
713 bh->b_list = BUF_CLEAN;
714 bh->b_end_io = handler;
715 bh->b_dev_id = dev_id;
718 static void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
720 mark_buffer_uptodate(bh, uptodate);
721 unlock_buffer(bh);
724 static void end_buffer_io_bad(struct buffer_head *bh, int uptodate)
726 mark_buffer_uptodate(bh, uptodate);
727 unlock_buffer(bh);
728 BUG();
731 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
733 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
734 unsigned long flags;
735 struct buffer_head *tmp;
736 struct page *page;
738 mark_buffer_uptodate(bh, uptodate);
740 /* This is a temporary buffer used for page I/O. */
741 page = bh->b_page;
743 if (!uptodate)
744 SetPageError(page);
747 * Be _very_ careful from here on. Bad things can happen if
748 * two buffer heads end IO at almost the same time and both
749 * decide that the page is now completely done.
751 * Async buffer_heads are here only as labels for IO, and get
752 * thrown away once the IO for this page is complete. IO is
753 * deemed complete once all buffers have been visited
754 * (b_count==0) and are now unlocked. We must make sure that
755 * only the _last_ buffer that decrements its count is the one
756 * that unlocks the page.
758 spin_lock_irqsave(&page_uptodate_lock, flags);
759 unlock_buffer(bh);
760 atomic_dec(&bh->b_count);
761 tmp = bh->b_this_page;
762 while (tmp != bh) {
763 if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp))
764 goto still_busy;
765 tmp = tmp->b_this_page;
768 /* OK, the async IO on this page is complete. */
769 spin_unlock_irqrestore(&page_uptodate_lock, flags);
772 * if none of the buffers had errors then we can set the
773 * page uptodate:
775 if (!PageError(page))
776 SetPageUptodate(page);
779 * Run the hooks that have to be done when a page I/O has completed.
781 if (PageTestandClearDecrAfter(page))
782 atomic_dec(&nr_async_pages);
784 UnlockPage(page);
786 return;
788 still_busy:
789 spin_unlock_irqrestore(&page_uptodate_lock, flags);
790 return;
794 * Ok, this is getblk, and it isn't very clear, again to hinder
795 * race-conditions. Most of the code is seldom used, (ie repeating),
796 * so it should be much more efficient than it looks.
798 * The algorithm is changed: hopefully better, and an elusive bug removed.
800 * 14.02.92: changed it to sync dirty buffers a bit: better performance
801 * when the filesystem starts to get full of dirty blocks (I hope).
803 struct buffer_head * getblk(kdev_t dev, int block, int size)
805 struct buffer_head * bh;
806 int isize;
808 repeat:
809 bh = get_hash_table(dev, block, size);
810 if (bh)
811 goto out;
813 isize = BUFSIZE_INDEX(size);
814 spin_lock(&free_list[isize].lock);
815 bh = free_list[isize].list;
816 if (bh) {
817 __remove_from_free_list(bh, isize);
818 atomic_set(&bh->b_count, 1);
820 spin_unlock(&free_list[isize].lock);
823 * OK, FINALLY we know that this buffer is the only one of
824 * its kind, we hold a reference (b_count>0), it is unlocked,
825 * and it is clean.
827 if (bh) {
828 init_buffer(bh, end_buffer_io_sync, NULL);
829 bh->b_dev = dev;
830 bh->b_blocknr = block;
831 bh->b_state = 1 << BH_Mapped;
833 /* Insert the buffer into the regular lists */
834 insert_into_queues(bh);
835 out:
836 touch_buffer(bh);
837 return bh;
841 * If we block while refilling the free list, somebody may
842 * create the buffer first ... search the hashes again.
844 refill_freelist(size);
845 goto repeat;
848 /* -1 -> no need to flush
849 0 -> async flush
850 1 -> sync flush (wait for I/O completion) */
851 static int balance_dirty_state(kdev_t dev)
853 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
855 dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
856 tot = nr_free_buffer_pages();
857 tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
859 dirty *= 200;
860 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
861 hard_dirty_limit = soft_dirty_limit * 2;
863 if (dirty > soft_dirty_limit) {
864 if (dirty > hard_dirty_limit)
865 return 1;
866 return 0;
868 return -1;
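/*
 * The arithmetic above compares dirty*200 against tot*nfract, i.e. it checks
 * whether dirty/tot exceeds nfract/200.  With the default nfract of 40 that
 * puts the soft limit at 20% of the freeable buffer pages being dirty and the
 * hard limit (twice the soft one) at 40%.
 */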
872 * if a new dirty buffer is created we need to balance bdflush.
874 * in the future we might want to make bdflush aware of different
875 * pressures on different devices - thus the (currently unused)
876 * 'dev' parameter.
878 void balance_dirty(kdev_t dev)
880 int state = balance_dirty_state(dev);
882 if (state < 0)
883 return;
884 wakeup_bdflush(state);
887 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
889 bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
890 refile_buffer(bh);
893 /* atomic version, the user must call balance_dirty() by hand
894 as soon as it becomes possible to block */
895 void __mark_buffer_dirty(struct buffer_head *bh, int flag)
897 if (!atomic_set_buffer_dirty(bh))
898 __mark_dirty(bh, flag);
901 void mark_buffer_dirty(struct buffer_head *bh, int flag)
903 __mark_buffer_dirty(bh, flag);
904 balance_dirty(bh->b_dev);
908 * A buffer may need to be moved from one buffer list to another
909 * (e.g. in case it is not shared any more). Handle this.
911 static void __refile_buffer(struct buffer_head *bh)
913 int dispose = BUF_CLEAN;
914 if (buffer_locked(bh))
915 dispose = BUF_LOCKED;
916 if (buffer_dirty(bh))
917 dispose = BUF_DIRTY;
918 if (buffer_protected(bh))
919 dispose = BUF_PROTECTED;
920 if (dispose != bh->b_list) {
921 __remove_from_lru_list(bh, bh->b_list);
922 bh->b_list = dispose;
923 __insert_into_lru_list(bh, dispose);
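/*
 * Note that in __refile_buffer() the later tests overwrite 'dispose', so
 * BUF_PROTECTED takes precedence over BUF_DIRTY, which in turn takes
 * precedence over BUF_LOCKED; only a clean, unlocked, unprotected buffer
 * stays on (or moves to) BUF_CLEAN.
 */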
927 void refile_buffer(struct buffer_head *bh)
929 spin_lock(&lru_list_lock);
930 __refile_buffer(bh);
931 spin_unlock(&lru_list_lock);
935 * Release a buffer head
937 void __brelse(struct buffer_head * buf)
939 if (atomic_read(&buf->b_count)) {
940 atomic_dec(&buf->b_count);
941 return;
943 printk("VFS: brelse: Trying to free free buffer\n");
947 * bforget() is like brelse(), except it puts the buffer on the
948 * free list if it can.. We can NOT free the buffer if:
949 * - there are other users of it
950 * - it is locked and thus can have active IO
952 void __bforget(struct buffer_head * buf)
954 /* grab the lru lock here to block bdflush. */
955 spin_lock(&lru_list_lock);
956 write_lock(&hash_table_lock);
957 if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf))
958 goto in_use;
959 __hash_unlink(buf);
960 write_unlock(&hash_table_lock);
961 __remove_from_lru_list(buf, buf->b_list);
962 spin_unlock(&lru_list_lock);
963 put_last_free(buf);
964 return;
966 in_use:
967 write_unlock(&hash_table_lock);
968 spin_unlock(&lru_list_lock);
972 * bread() reads a specified block and returns the buffer that contains
973 * it. It returns NULL if the block was unreadable.
975 struct buffer_head * bread(kdev_t dev, int block, int size)
977 struct buffer_head * bh;
979 bh = getblk(dev, block, size);
980 if (buffer_uptodate(bh))
981 return bh;
982 ll_rw_block(READ, 1, &bh);
983 wait_on_buffer(bh);
984 if (buffer_uptodate(bh))
985 return bh;
986 brelse(bh);
987 return NULL;
991 * Ok, breada can be used as bread, but additionally marks other
992 * blocks for reading as well. End the argument list with a negative
993 * number.
996 #define NBUF 16
998 struct buffer_head * breada(kdev_t dev, int block, int bufsize,
999 unsigned int pos, unsigned int filesize)
1001 struct buffer_head * bhlist[NBUF];
1002 unsigned int blocks;
1003 struct buffer_head * bh;
1004 int index;
1005 int i, j;
1007 if (pos >= filesize)
1008 return NULL;
1010 if (block < 0)
1011 return NULL;
1013 bh = getblk(dev, block, bufsize);
1014 index = BUFSIZE_INDEX(bh->b_size);
1016 if (buffer_uptodate(bh))
1017 return(bh);
1018 else ll_rw_block(READ, 1, &bh);
1020 blocks = (filesize - pos) >> (9+index);
1022 if (blocks < (read_ahead[MAJOR(dev)] >> index))
1023 blocks = read_ahead[MAJOR(dev)] >> index;
1024 if (blocks > NBUF)
1025 blocks = NBUF;
1027 /* if (blocks) printk("breada (new) %d blocks\n",blocks); */
1029 bhlist[0] = bh;
1030 j = 1;
1031 for(i=1; i<blocks; i++) {
1032 bh = getblk(dev,block+i,bufsize);
1033 if (buffer_uptodate(bh)) {
1034 brelse(bh);
1035 break;
1037 else bhlist[j++] = bh;
1040 /* Request the read for these buffers, and then release them. */
1041 if (j>1)
1042 ll_rw_block(READA, (j-1), bhlist+1);
1043 for(i=1; i<j; i++)
1044 brelse(bhlist[i]);
1046 /* Wait for this buffer, and then continue on. */
1047 bh = bhlist[0];
1048 wait_on_buffer(bh);
1049 if (buffer_uptodate(bh))
1050 return bh;
1051 brelse(bh);
1052 return NULL;
1056 * Note: the caller should wake up the buffer_wait list if needed.
1058 static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
1060 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1061 kmem_cache_free(bh_cachep, bh);
1062 } else {
1063 bh->b_blocknr = -1;
1064 init_waitqueue_head(&bh->b_wait);
1065 nr_unused_buffer_heads++;
1066 bh->b_next_free = unused_list;
1067 bh->b_this_page = NULL;
1068 unused_list = bh;
1073 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1074 * no-buffer-head deadlock. Return NULL on failure; waiting for
1075 * buffer heads is now handled in create_buffers().
1077 static struct buffer_head * get_unused_buffer_head(int async)
1079 struct buffer_head * bh;
1081 spin_lock(&unused_list_lock);
1082 if (nr_unused_buffer_heads > NR_RESERVED) {
1083 bh = unused_list;
1084 unused_list = bh->b_next_free;
1085 nr_unused_buffer_heads--;
1086 spin_unlock(&unused_list_lock);
1087 return bh;
1089 spin_unlock(&unused_list_lock);
1091 /* This is critical. We can't swap out pages to get
1092 * more buffer heads, because the swap-out may need
1093 * more buffer-heads itself. Thus SLAB_BUFFER.
1095 if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
1096 memset(bh, 0, sizeof(*bh));
1097 init_waitqueue_head(&bh->b_wait);
1098 return bh;
1102 * If we need an async buffer, use the reserved buffer heads.
1104 if (async) {
1105 spin_lock(&unused_list_lock);
1106 if (unused_list) {
1107 bh = unused_list;
1108 unused_list = bh->b_next_free;
1109 nr_unused_buffer_heads--;
1110 spin_unlock(&unused_list_lock);
1111 return bh;
1113 spin_unlock(&unused_list_lock);
1115 #if 0
1117 * (Pending further analysis ...)
1118 * Ordinary (non-async) requests can use a different memory priority
1119 * to free up pages. Any swapping thus generated will use async
1120 * buffer heads.
1122 if(!async &&
1123 (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) {
1124 memset(bh, 0, sizeof(*bh));
1125 init_waitqueue_head(&bh->b_wait);
1126 return bh;
1128 #endif
1130 return NULL;
1133 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1135 bh->b_page = page;
1136 if (offset >= PAGE_SIZE)
1137 BUG();
1138 if (PageHighMem(page))
1140 * This catches illegal uses and preserves the offset:
1142 bh->b_data = (char *)(0 + offset);
1143 else
1144 bh->b_data = (char *)(page_address(page) + offset);
1148 * Create the appropriate buffers when given a page for data area and
1149 * the size of each buffer.. Use the bh->b_this_page linked list to
1150 * follow the buffers created. Return NULL if unable to create more
1151 * buffers.
1152 * The async flag is used to differentiate async IO (paging, swapping)
1153 * from ordinary buffer allocations, and only async requests are allowed
1154 * to sleep waiting for buffer heads.
1156 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1158 struct buffer_head *bh, *head;
1159 long offset;
1161 try_again:
1162 head = NULL;
1163 offset = PAGE_SIZE;
1164 while ((offset -= size) >= 0) {
1165 bh = get_unused_buffer_head(async);
1166 if (!bh)
1167 goto no_grow;
1169 bh->b_dev = B_FREE; /* Flag as unused */
1170 bh->b_this_page = head;
1171 head = bh;
1173 bh->b_state = 0;
1174 bh->b_next_free = NULL;
1175 bh->b_pprev = NULL;
1176 atomic_set(&bh->b_count, 0);
1177 bh->b_size = size;
1179 set_bh_page(bh, page, offset);
1181 bh->b_list = BUF_CLEAN;
1182 bh->b_end_io = end_buffer_io_bad;
1184 return head;
1186 * In case anything failed, we just free everything we got.
1188 no_grow:
1189 if (head) {
1190 spin_lock(&unused_list_lock);
1191 do {
1192 bh = head;
1193 head = head->b_this_page;
1194 __put_unused_buffer_head(bh);
1195 } while (head);
1196 spin_unlock(&unused_list_lock);
1198 /* Wake up any waiters ... */
1199 wake_up(&buffer_wait);
1203 * Return failure for non-async IO requests. Async IO requests
1204 * are not allowed to fail, so we have to wait until buffer heads
1205 * become available. But we don't want tasks sleeping with
1206 * partially complete buffers, so all were released above.
1208 if (!async)
1209 return NULL;
1211 /* We're _really_ low on memory. Now we just
1212 * wait for old buffer heads to become free due to
1213 * finishing IO. Since this is an async request and
1214 * the reserve list is empty, we're sure there are
1215 * async buffer heads in use.
1217 run_task_queue(&tq_disk);
1220 * Set our state for sleeping, then check again for buffer heads.
1221 * This ensures we won't miss a wake_up from an interrupt.
1223 wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
1224 goto try_again;
1227 static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size)
1229 struct buffer_head *head, *bh, *tail;
1230 int block;
1232 if (!PageLocked(page))
1233 BUG();
1235 * Allocate async buffer heads pointing to this page, just for I/O.
1236 * They don't show up in the buffer hash table, but they *are*
1237 * registered in page->buffers.
1239 head = create_buffers(page, size, 1);
1240 if (page->buffers)
1241 BUG();
1242 if (!head)
1243 BUG();
1244 tail = head;
1245 for (bh = head; bh; bh = bh->b_this_page) {
1246 block = *(b++);
1248 tail = bh;
1249 init_buffer(bh, end_buffer_io_async, NULL);
1250 bh->b_dev = dev;
1251 bh->b_blocknr = block;
1253 set_bit(BH_Mapped, &bh->b_state);
1255 tail->b_this_page = head;
1256 page_cache_get(page);
1257 page->buffers = head;
1258 return 0;
1261 static void unmap_buffer(struct buffer_head * bh)
1263 if (buffer_mapped(bh)) {
1264 mark_buffer_clean(bh);
1265 wait_on_buffer(bh);
1266 clear_bit(BH_Uptodate, &bh->b_state);
1267 clear_bit(BH_Mapped, &bh->b_state);
1268 clear_bit(BH_Req, &bh->b_state);
1269 clear_bit(BH_New, &bh->b_state);
1274 * We don't have to release all buffers here, but
1275 * we have to be sure that no dirty buffer is left
1276 * and no IO is going on (no buffer is locked), because
1277 * we have truncated the file and are going to free the
1278 * blocks on-disk..
1280 int block_flushpage(struct page *page, unsigned long offset)
1282 struct buffer_head *head, *bh, *next;
1283 unsigned int curr_off = 0;
1285 if (!PageLocked(page))
1286 BUG();
1287 if (!page->buffers)
1288 return 1;
1290 head = page->buffers;
1291 bh = head;
1292 do {
1293 unsigned int next_off = curr_off + bh->b_size;
1294 next = bh->b_this_page;
1297 * is this block fully flushed?
1299 if (offset <= curr_off)
1300 unmap_buffer(bh);
1301 curr_off = next_off;
1302 bh = next;
1303 } while (bh != head);
1306 * subtle. We release buffer-heads only if this is
1307 * the 'final' flushpage. We have invalidated the get_block
1308 * cached value unconditionally, so real IO is not
1309 * possible anymore.
1311 * If the free doesn't work out, the buffers can be
1312 * left around - they just turn into anonymous buffers
1313 * instead.
1315 if (!offset) {
1316 if (!try_to_free_buffers(page, 0)) {
1317 atomic_inc(&buffermem_pages);
1318 return 0;
1322 return 1;
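/*
 * Attach a full complement of (so far unmapped) async buffer heads to a page
 * that has none yet: every buffer starts out with b_blocknr 0 and
 * end_buffer_io_bad as its completion handler until the caller maps it.
 */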
1325 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
1327 struct buffer_head *bh, *head, *tail;
1329 head = create_buffers(page, blocksize, 1);
1330 if (page->buffers)
1331 BUG();
1333 bh = head;
1334 do {
1335 bh->b_dev = inode->i_dev;
1336 bh->b_blocknr = 0;
1337 bh->b_end_io = end_buffer_io_bad;
1338 tail = bh;
1339 bh = bh->b_this_page;
1340 } while (bh);
1341 tail->b_this_page = head;
1342 page->buffers = head;
1343 page_cache_get(page);
1346 static void unmap_underlying_metadata(struct buffer_head * bh)
1348 struct buffer_head *old_bh;
1350 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1351 if (old_bh) {
1352 unmap_buffer(old_bh);
1353 /* Here we could run brelse or bforget. We use
1354 bforget because it will try to put the buffer
1355 in the freelist. */
1356 __bforget(old_bh);
1361 * block_write_full_page() is SMP-safe - currently it's still
1362 * being called with the kernel lock held, but the code is ready.
1364 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1366 int err, i, need_balance_dirty = 0;
1367 unsigned long block;
1368 struct buffer_head *bh, *head;
1370 if (!PageLocked(page))
1371 BUG();
1373 if (!page->buffers)
1374 create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
1375 head = page->buffers;
1377 block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1379 bh = head;
1380 i = 0;
1381 do {
1383 * If the buffer isn't up-to-date, we can't be sure
1384 * that the buffer has been initialized with the proper
1385 * block number information etc..
1387 * Leave it to the low-level FS to make all those
1388 * decisions (block #0 may actually be a valid block)
1390 bh->b_end_io = end_buffer_io_sync;
1391 if (!buffer_mapped(bh)) {
1392 err = get_block(inode, block, bh, 1);
1393 if (err)
1394 goto out;
1395 if (buffer_new(bh))
1396 unmap_underlying_metadata(bh);
1398 set_bit(BH_Uptodate, &bh->b_state);
1399 if (!atomic_set_buffer_dirty(bh)) {
1400 __mark_dirty(bh, 0);
1401 need_balance_dirty = 1;
1404 bh = bh->b_this_page;
1405 block++;
1406 } while (bh != head);
1408 if (need_balance_dirty)
1409 balance_dirty(bh->b_dev);
1411 SetPageUptodate(page);
1412 return 0;
1413 out:
1414 ClearPageUptodate(page);
1415 return err;
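/*
 * Prepare the buffers covering [from, to) for a write: map any missing
 * blocks with get_block(..., 1), zero the parts of freshly allocated blocks
 * that fall outside the write range, and read in any buffer that is not
 * uptodate but only partially covered, waiting for those reads before
 * returning.
 */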
1418 static int __block_prepare_write(struct inode *inode, struct page *page,
1419 unsigned from, unsigned to, get_block_t *get_block)
1421 unsigned block_start, block_end;
1422 unsigned long block;
1423 int err = 0;
1424 unsigned blocksize, bbits;
1425 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1426 char *kaddr = (char *)kmap(page);
1428 blocksize = inode->i_sb->s_blocksize;
1429 if (!page->buffers)
1430 create_empty_buffers(page, inode, blocksize);
1431 head = page->buffers;
1433 bbits = inode->i_sb->s_blocksize_bits;
1434 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1436 for(bh = head, block_start = 0; bh != head || !block_start;
1437 block++, block_start=block_end, bh = bh->b_this_page) {
1438 if (!bh)
1439 BUG();
1440 block_end = block_start+blocksize;
1441 if (block_end <= from)
1442 continue;
1443 if (block_start >= to)
1444 break;
1445 bh->b_end_io = end_buffer_io_sync;
1446 if (!buffer_mapped(bh)) {
1447 err = get_block(inode, block, bh, 1);
1448 if (err)
1449 goto out;
1450 if (buffer_new(bh)) {
1451 unmap_underlying_metadata(bh);
1452 if (block_end > to)
1453 memset(kaddr+to, 0, block_end-to);
1454 if (block_start < from)
1455 memset(kaddr+block_start, 0, from-block_start);
1456 continue;
1459 if (!buffer_uptodate(bh) &&
1460 (block_start < from || block_end > to)) {
1461 ll_rw_block(READ, 1, &bh);
1462 *wait_bh++=bh;
1466 * If we issued read requests - let them complete.
1468 while(wait_bh > wait) {
1469 wait_on_buffer(*--wait_bh);
1470 err = -EIO;
1471 if (!buffer_uptodate(*wait_bh))
1472 goto out;
1474 return 0;
1475 out:
1476 return err;
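/*
 * Commit the range [from, to): mark the buffers inside it uptodate and dirty
 * (balancing bdflush if any of them was newly dirtied) and, if no buffer
 * outside the range was left not uptodate, mark the whole page uptodate so
 * the next read can skip readpage().
 */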
1479 static int __block_commit_write(struct inode *inode, struct page *page,
1480 unsigned from, unsigned to)
1482 unsigned block_start, block_end;
1483 int partial = 0, need_balance_dirty = 0;
1484 unsigned blocksize;
1485 struct buffer_head *bh, *head;
1487 blocksize = inode->i_sb->s_blocksize;
1489 for(bh = head = page->buffers, block_start = 0;
1490 bh != head || !block_start;
1491 block_start=block_end, bh = bh->b_this_page) {
1492 block_end = block_start + blocksize;
1493 if (block_end <= from || block_start >= to) {
1494 if (!buffer_uptodate(bh))
1495 partial = 1;
1496 } else {
1497 set_bit(BH_Uptodate, &bh->b_state);
1498 if (!atomic_set_buffer_dirty(bh)) {
1499 __mark_dirty(bh, 0);
1500 need_balance_dirty = 1;
1505 if (need_balance_dirty)
1506 balance_dirty(bh->b_dev);
1508 * If this is a partial write that happened to make all buffers
1509 * uptodate then we can optimize away a bogus readpage() for
1510 * the next read(). Here we 'discover' whether the page went
1511 * uptodate as a result of this (potentially partial) write.
1513 if (!partial)
1514 SetPageUptodate(page);
1515 return 0;
1519 * Generic "read page" function for block devices that have the normal
1520 * get_block functionality. This is most of the block device filesystems.
1521 * Reads the page asynchronously --- the unlock_buffer() and
1522 * mark_buffer_uptodate() functions propagate buffer state into the
1523 * page struct once IO has completed.
1525 int block_read_full_page(struct page *page, get_block_t *get_block)
1527 struct inode *inode = (struct inode*)page->mapping->host;
1528 unsigned long iblock, lblock;
1529 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1530 unsigned int blocksize, blocks;
1531 unsigned long kaddr = 0;
1532 int nr, i;
1534 if (!PageLocked(page))
1535 PAGE_BUG(page);
1536 blocksize = inode->i_sb->s_blocksize;
1537 if (!page->buffers)
1538 create_empty_buffers(page, inode, blocksize);
1539 head = page->buffers;
1541 blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;
1542 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1543 lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
1544 bh = head;
1545 nr = 0;
1546 i = 0;
1548 do {
1549 if (buffer_uptodate(bh))
1550 continue;
1552 if (!buffer_mapped(bh)) {
1553 if (iblock < lblock)
1554 get_block(inode, iblock, bh, 0);
1555 if (!buffer_mapped(bh)) {
1556 if (!kaddr)
1557 kaddr = kmap(page);
1558 memset((char *)(kaddr + i*blocksize), 0, blocksize);
1559 set_bit(BH_Uptodate, &bh->b_state);
1560 continue;
1564 init_buffer(bh, end_buffer_io_async, NULL);
1565 atomic_inc(&bh->b_count);
1566 arr[nr] = bh;
1567 nr++;
1568 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1570 if (nr) {
1571 if (Page_Uptodate(page))
1572 BUG();
1573 ll_rw_block(READ, nr, arr);
1574 } else {
1576 * all buffers are uptodate - we can set the page
1577 * uptodate as well.
1579 SetPageUptodate(page);
1580 UnlockPage(page);
1582 if (kaddr)
1583 kunmap(page);
1584 return 0;
1588 * For moronic filesystems that do not allow holes in files.
1589 * We may have to extend the file.
1592 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1594 struct address_space *mapping = page->mapping;
1595 struct inode *inode = (struct inode*)mapping->host;
1596 struct page *new_page;
1597 unsigned long pgpos;
1598 long status;
1599 unsigned zerofrom;
1600 unsigned blocksize = inode->i_sb->s_blocksize;
1601 char *kaddr;
1603 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1604 status = -ENOMEM;
1605 new_page = grab_cache_page(mapping, pgpos);
1606 if (!new_page)
1607 goto out;
1608 /* we might sleep */
1609 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1610 UnlockPage(new_page);
1611 page_cache_release(new_page);
1612 continue;
1614 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1615 if (zerofrom & (blocksize-1)) {
1616 *bytes |= (blocksize-1);
1617 (*bytes)++;
1619 status = __block_prepare_write(inode, new_page, zerofrom,
1620 PAGE_CACHE_SIZE, get_block);
1621 if (status)
1622 goto out_unmap;
1623 kaddr = (char*)page_address(new_page);
1624 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1625 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1626 kunmap(new_page);
1627 UnlockPage(new_page);
1628 page_cache_release(new_page);
1631 if (page->index < pgpos) {
1632 /* completely inside the area */
1633 zerofrom = offset;
1634 } else {
1635 /* page covers the boundary, find the boundary offset */
1636 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1638 /* if we will expand the thing last block will be filled */
1639 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1640 *bytes |= (blocksize-1);
1641 (*bytes)++;
1644 /* starting below the boundary? Nothing to zero out */
1645 if (offset <= zerofrom)
1646 zerofrom = offset;
1648 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1649 if (status)
1650 goto out1;
1651 kaddr = (char*)page_address(page);
1652 if (zerofrom < offset) {
1653 memset(kaddr+zerofrom, 0, offset-zerofrom);
1654 __block_commit_write(inode, page, zerofrom, offset);
1656 return 0;
1657 out1:
1658 ClearPageUptodate(page);
1659 kunmap(page);
1660 return status;
1662 out_unmap:
1663 ClearPageUptodate(new_page);
1664 kunmap(new_page);
1665 UnlockPage(new_page);
1666 page_cache_release(new_page);
1667 out:
1668 return status;
1671 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1672 get_block_t *get_block)
1674 struct inode *inode = (struct inode*)page->mapping->host;
1675 int err = __block_prepare_write(inode, page, from, to, get_block);
1676 if (err) {
1677 ClearPageUptodate(page);
1678 kunmap(page);
1680 return err;
1683 int generic_commit_write(struct file *file, struct page *page,
1684 unsigned from, unsigned to)
1686 struct inode *inode = (struct inode*)page->mapping->host;
1687 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1688 __block_commit_write(inode,page,from,to);
1689 kunmap(page);
1690 if (pos > inode->i_size)
1691 inode->i_size = pos;
1692 return 0;
1695 int block_write_full_page(struct page *page, get_block_t *get_block)
1697 struct inode *inode = (struct inode*)page->mapping->host;
1698 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1699 unsigned offset;
1700 int err;
1702 /* easy case */
1703 if (page->index < end_index)
1704 return __block_write_full_page(inode, page, get_block);
1706 /* things got complicated... */
1707 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
1708 /* OK, are we completely out? */
1709 if (page->index >= end_index+1 || !offset)
1710 return -EIO;
1711 /* Sigh... will have to work, then... */
1712 err = __block_prepare_write(inode, page, 0, offset, get_block);
1713 if (!err) {
1714 memset((char *)page_address(page)+offset, 0, PAGE_CACHE_SIZE-offset);
1715 __block_commit_write(inode,page,0,offset);
1716 done:
1717 kunmap(page);
1718 return err;
1720 ClearPageUptodate(page);
1721 goto done;
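/*
 * Map a logical block of the file to its on-disk block number by calling
 * get_block() with create == 0 on a throwaway buffer_head; a result of 0
 * means a hole (or an unmapped block).
 */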
1724 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
1726 struct buffer_head tmp;
1727 struct inode *inode = (struct inode*)mapping->host;
1728 tmp.b_state = 0;
1729 tmp.b_blocknr = 0;
1730 get_block(inode, block, &tmp, 0);
1731 return tmp.b_blocknr;
1735 * IO completion routine for a buffer_head being used for kiobuf IO: we
1736 * can't dispatch the kiobuf callback until io_count reaches 0.
1739 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
1741 struct kiobuf *kiobuf;
1743 mark_buffer_uptodate(bh, uptodate);
1745 kiobuf = bh->b_kiobuf;
1746 unlock_buffer(bh);
1747 end_kio_request(kiobuf, uptodate);
1752 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
1753 * for them to complete. Clean up the buffer_heads afterwards.
1756 static int do_kio(int rw, int nr, struct buffer_head *bh[], int size)
1758 int iosize;
1759 int i;
1760 struct buffer_head *tmp;
1762 if (rw == WRITE)
1763 rw = WRITERAW;
1764 ll_rw_block(rw, nr, bh);
1766 iosize = 0;
1767 spin_lock(&unused_list_lock);
1769 for (i = nr; --i >= 0; ) {
1770 iosize += size;
1771 tmp = bh[i];
1772 if (buffer_locked(tmp)) {
1773 spin_unlock(&unused_list_lock);
1774 wait_on_buffer(tmp);
1775 spin_lock(&unused_list_lock);
1778 if (!buffer_uptodate(tmp)) {
1779 /* We are traversing bh'es in reverse order so
1780 clearing iosize on error calculates the
1781 amount of IO before the first error. */
1782 iosize = 0;
1784 __put_unused_buffer_head(tmp);
1787 spin_unlock(&unused_list_lock);
1789 return iosize;
1793 * Start I/O on a physical range of kernel memory, defined by a vector
1794 * of kiobuf structs (much like a user-space iovec list).
1796 * The kiobuf must already be locked for IO. IO is submitted
1797 * asynchronously: you need to check page->locked, page->uptodate, and
1798 * maybe wait on page->wait.
1800 * It is up to the caller to make sure that there are enough blocks
1801 * passed in to completely map the iobufs to disk.
1804 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
1805 kdev_t dev, unsigned long b[], int size)
1807 int err;
1808 int length;
1809 int transferred;
1810 int i;
1811 int bufind;
1812 int pageind;
1813 int bhind;
1814 int offset;
1815 unsigned long blocknr;
1816 struct kiobuf * iobuf = NULL;
1817 struct page * map;
1818 struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
1820 if (!nr)
1821 return 0;
1824 * First, do some alignment and validity checks
1826 for (i = 0; i < nr; i++) {
1827 iobuf = iovec[i];
1828 if ((iobuf->offset & (size-1)) ||
1829 (iobuf->length & (size-1)))
1830 return -EINVAL;
1831 if (!iobuf->nr_pages)
1832 panic("brw_kiovec: iobuf not initialised");
1836 * OK to walk down the iovec doing page IO on each page we find.
1838 bufind = bhind = transferred = err = 0;
1839 for (i = 0; i < nr; i++) {
1840 iobuf = iovec[i];
1841 offset = iobuf->offset;
1842 length = iobuf->length;
1843 iobuf->errno = 0;
1845 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
1846 map = iobuf->maplist[pageind];
1847 if (!map) {
1848 err = -EFAULT;
1849 goto error;
1852 while (length > 0) {
1853 blocknr = b[bufind++];
1854 tmp = get_unused_buffer_head(0);
1855 if (!tmp) {
1856 err = -ENOMEM;
1857 goto error;
1860 tmp->b_dev = B_FREE;
1861 tmp->b_size = size;
1862 set_bh_page(tmp, map, offset);
1863 tmp->b_this_page = tmp;
1865 init_buffer(tmp, end_buffer_io_kiobuf, NULL);
1866 tmp->b_dev = dev;
1867 tmp->b_blocknr = blocknr;
1868 tmp->b_state = 1 << BH_Mapped;
1869 tmp->b_kiobuf = iobuf;
1871 if (rw == WRITE) {
1872 set_bit(BH_Uptodate, &tmp->b_state);
1873 set_bit(BH_Dirty, &tmp->b_state);
1876 bh[bhind++] = tmp;
1877 length -= size;
1878 offset += size;
1880 atomic_inc(&iobuf->io_count);
1883 * Start the IO if we have got too much
1885 if (bhind >= KIO_MAX_SECTORS) {
1886 err = do_kio(rw, bhind, bh, size);
1887 if (err >= 0)
1888 transferred += err;
1889 else
1890 goto finished;
1891 bhind = 0;
1894 if (offset >= PAGE_SIZE) {
1895 offset = 0;
1896 break;
1898 } /* End of block loop */
1899 } /* End of page loop */
1900 } /* End of iovec loop */
1902 /* Is there any IO still left to submit? */
1903 if (bhind) {
1904 err = do_kio(rw, bhind, bh, size);
1905 if (err >= 0)
1906 transferred += err;
1907 else
1908 goto finished;
1911 finished:
1912 if (transferred)
1913 return transferred;
1914 return err;
1916 error:
1917 /* We got an error allocating the bh'es. Just free the current
1918 buffer_heads and exit. */
1919 spin_lock(&unused_list_lock);
1920 for (i = bhind; --i >= 0; ) {
1921 __put_unused_buffer_head(bh[i]);
1923 spin_unlock(&unused_list_lock);
1924 goto finished;
1928 * Start I/O on a page.
1929 * This function expects the page to be locked and may return
1930 * before I/O is complete. You then have to check page->locked,
1931 * page->uptodate, and maybe wait on page->wait.
1933 * brw_page() is SMP-safe, although it's being called with the
1934 * kernel lock held - but the code is ready.
1936 * FIXME: we need a swapper_inode->get_block function to remove
1937 * some of the bmap kludges and interface ugliness here.
1939 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
1941 struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
1942 int nr, fresh /* temporary debugging flag */, block;
1944 if (!PageLocked(page))
1945 panic("brw_page: page not locked for I/O");
1946 // ClearPageError(page);
1948 * We pretty much rely on the page lock for this, because
1949 * create_page_buffers() might sleep.
1951 fresh = 0;
1952 if (!page->buffers) {
1953 create_page_buffers(rw, page, dev, b, size);
1954 fresh = 1;
1956 if (!page->buffers)
1957 BUG();
1959 head = page->buffers;
1960 bh = head;
1961 nr = 0;
1962 do {
1963 block = *(b++);
1965 if (fresh && (atomic_read(&bh->b_count) != 0))
1966 BUG();
1967 if (rw == READ) {
1968 if (!fresh)
1969 BUG();
1970 if (!buffer_uptodate(bh)) {
1971 arr[nr++] = bh;
1972 atomic_inc(&bh->b_count);
1974 } else { /* WRITE */
1975 if (!bh->b_blocknr) {
1976 if (!block)
1977 BUG();
1978 bh->b_blocknr = block;
1979 } else {
1980 if (!block)
1981 BUG();
1983 set_bit(BH_Uptodate, &bh->b_state);
1984 set_bit(BH_Dirty, &bh->b_state);
1985 arr[nr++] = bh;
1986 atomic_inc(&bh->b_count);
1988 bh = bh->b_this_page;
1989 } while (bh != head);
1990 if ((rw == READ) && nr) {
1991 if (Page_Uptodate(page))
1992 BUG();
1993 ll_rw_block(rw, nr, arr);
1994 } else {
1995 if (!nr && rw == READ) {
1996 SetPageUptodate(page);
1997 UnlockPage(page);
1999 if (nr && (rw == WRITE))
2000 ll_rw_block(rw, nr, arr);
2002 return 0;
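/*
 * Write the symlink body into page 0 of the inode's page cache using the
 * address_space prepare_write/commit_write operations, then read the page
 * back so it ends up fully uptodate, and finally mark the inode dirty.
 */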
2005 int block_symlink(struct inode *inode, const char *symname, int len)
2007 struct address_space *mapping = inode->i_mapping;
2008 struct page *page = grab_cache_page(mapping, 0);
2009 int err = -ENOMEM;
2010 char *kaddr;
2012 if (!page)
2013 goto fail;
2014 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2015 if (err)
2016 goto fail_map;
2017 kaddr = (char*)page_address(page);
2018 memcpy(kaddr, symname, len-1);
2019 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2021 * Notice that we are _not_ going to block here - end of page is
2022 * unmapped, so this will only try to map the rest of page, see
2023 * that it is unmapped (typically even will not look into inode -
2024 * ->i_size will be enough for everything) and zero it out.
2025 * OTOH it's obviously correct and should make the page up-to-date.
2027 err = mapping->a_ops->readpage(NULL, page);
2028 wait_on_page(page);
2029 page_cache_release(page);
2030 if (err < 0)
2031 goto fail;
2032 mark_inode_dirty(inode);
2033 return 0;
2034 fail_map:
2035 UnlockPage(page);
2036 page_cache_release(page);
2037 fail:
2038 return err;
2042 * Try to increase the number of buffers available: the size argument
2043 * is used to determine what kind of buffers we want.
2045 static int grow_buffers(int size)
2047 struct page * page;
2048 struct buffer_head *bh, *tmp;
2049 struct buffer_head * insert_point;
2050 int isize;
2052 if ((size & 511) || (size > PAGE_SIZE)) {
2053 printk("VFS: grow_buffers: size = %d\n",size);
2054 return 0;
2057 page = alloc_page(GFP_BUFFER);
2058 if (!page)
2059 goto out;
2060 bh = create_buffers(page, size, 0);
2061 if (!bh)
2062 goto no_buffer_head;
2064 isize = BUFSIZE_INDEX(size);
2066 spin_lock(&free_list[isize].lock);
2067 insert_point = free_list[isize].list;
2068 tmp = bh;
2069 while (1) {
2070 if (insert_point) {
2071 tmp->b_next_free = insert_point->b_next_free;
2072 tmp->b_prev_free = insert_point;
2073 insert_point->b_next_free->b_prev_free = tmp;
2074 insert_point->b_next_free = tmp;
2075 } else {
2076 tmp->b_prev_free = tmp;
2077 tmp->b_next_free = tmp;
2079 insert_point = tmp;
2080 if (tmp->b_this_page)
2081 tmp = tmp->b_this_page;
2082 else
2083 break;
2085 tmp->b_this_page = bh;
2086 free_list[isize].list = bh;
2087 spin_unlock(&free_list[isize].lock);
2089 page->buffers = bh;
2090 page->flags &= ~(1 << PG_referenced);
2091 lru_cache_add(page);
2092 atomic_inc(&buffermem_pages);
2093 return 1;
2095 no_buffer_head:
2096 page_cache_release(page);
2097 out:
2098 return 0;
2102 * Sync all the buffers on one page..
2104 * If we have old buffers that are locked, we'll
2105 * wait on them, but we won't wait on the new ones
2106 * we're writing out now.
2108 * This all is required so that we can free up memory
2109 * later.
2111 static void sync_page_buffers(struct buffer_head *bh, int wait)
2113 struct buffer_head * tmp = bh;
2115 do {
2116 struct buffer_head *p = tmp;
2117 tmp = tmp->b_this_page;
2118 if (buffer_locked(p)) {
2119 if (wait)
2120 __wait_on_buffer(p);
2121 } else if (buffer_dirty(p))
2122 ll_rw_block(WRITE, 1, &p);
2123 } while (tmp != bh);
2127 * Can the buffer be thrown out?
2129 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
2130 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2133 * try_to_free_buffers() checks if all the buffers on this particular page
2134 * are unused, and free's the page if so.
2136 * Wake up bdflush() if this fails - if we're running low on memory due
2137 * to dirty buffers, we need to flush them out as quickly as possible.
2139 * NOTE: There are quite a number of ways that threads of control can
2140 * obtain a reference to a buffer head within a page. So we must
2141 * lock out all of these paths to cleanly toss the page.
2143 int try_to_free_buffers(struct page * page, int wait)
2145 struct buffer_head * tmp, * bh = page->buffers;
2146 int index = BUFSIZE_INDEX(bh->b_size);
2148 spin_lock(&lru_list_lock);
2149 write_lock(&hash_table_lock);
2150 spin_lock(&free_list[index].lock);
2151 tmp = bh;
2152 do {
2153 struct buffer_head *p = tmp;
2155 tmp = tmp->b_this_page;
2156 if (buffer_busy(p))
2157 goto busy_buffer_page;
2158 } while (tmp != bh);
2160 spin_lock(&unused_list_lock);
2161 tmp = bh;
2162 do {
2163 struct buffer_head * p = tmp;
2164 tmp = tmp->b_this_page;
2166 /* The buffer can be either on the regular
2167 * queues or on the free list..
2169 if (p->b_dev != B_FREE)
2170 __remove_from_queues(p);
2171 else
2172 __remove_from_free_list(p, index);
2173 __put_unused_buffer_head(p);
2174 } while (tmp != bh);
2175 spin_unlock(&unused_list_lock);
2177 /* Wake up anyone waiting for buffer heads */
2178 wake_up(&buffer_wait);
2180 /* And free the page */
2181 page->buffers = NULL;
2182 page_cache_release(page);
2183 spin_unlock(&free_list[index].lock);
2184 write_unlock(&hash_table_lock);
2185 spin_unlock(&lru_list_lock);
2186 return 1;
2188 busy_buffer_page:
2189 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2190 spin_unlock(&free_list[index].lock);
2191 write_unlock(&hash_table_lock);
2192 spin_unlock(&lru_list_lock);
2193 sync_page_buffers(bh, wait);
2194 return 0;
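/*
 * Editor's illustration: the main consumer of try_to_free_buffers() is the
 * VM reclaim path (mm/vmscan.c).  The fragment below is a paraphrase of
 * such a caller rather than the real code -- reclaim_page_buffers() is a
 * hypothetical name, and the real caller does more bookkeeping around it.
 */
static int reclaim_page_buffers(struct page *page, int wait)
{
	if (!page->buffers)
		return 1;	/* no buffers, nothing stops freeing the page */

	if (try_to_free_buffers(page, wait))
		return 1;	/* buffers are gone, page->buffers is now NULL */

	/*
	 * Some buffer on the page is still busy.  try_to_free_buffers()
	 * has already kicked off writeback via sync_page_buffers(), so
	 * just report failure and let the caller try another page.
	 */
	return 0;
}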
2197 /* ================== Debugging =================== */
2199 void show_buffers(void)
2201 #ifdef CONFIG_SMP
2202 struct buffer_head * bh;
2203 int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2204 int protected = 0;
2205 int nlist;
2206 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", "PROTECTED", };
2207 #endif
2209 printk("Buffer memory: %6dkB\n",
2210 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2212 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2213 if (!spin_trylock(&lru_list_lock))
2214 return;
2215 for(nlist = 0; nlist < NR_LIST; nlist++) {
2216 found = locked = dirty = used = lastused = protected = 0;
2217 bh = lru_list[nlist];
2218 if(!bh) continue;
2220 do {
2221 found++;
2222 if (buffer_locked(bh))
2223 locked++;
2224 if (buffer_protected(bh))
2225 protected++;
2226 if (buffer_dirty(bh))
2227 dirty++;
2228 if (atomic_read(&bh->b_count))
2229 used++, lastused = found;
2230 bh = bh->b_next_free;
2231 } while (bh != lru_list[nlist]);
2233 int tmp = nr_buffers_type[nlist];
2234 if (found != tmp)
2235 printk("%9s: BUG -> found %d, reported %d\n",
2236 buf_types[nlist], found, tmp);
2238 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2239 "%d locked, %d protected, %d dirty\n",
2240 buf_types[nlist], found, size_buffers_type[nlist]>>10,
2241 used, lastused, locked, protected, dirty);
2243 spin_unlock(&lru_list_lock);
2244 #endif
2247 /* ===================== Init ======================= */
2250 * allocate the hash table and init the free list
2251 * Use gfp() for the hash table to decrease TLB misses, use
2252 * SLAB cache for buffer heads.
2254 void __init buffer_init(unsigned long mempages)
2256 int order, i;
2257 unsigned int nr_hash;
2259 /* The buffer cache hash table is less important these days,
2260 * trim it a bit.
2262 mempages >>= 14;
2264 mempages *= sizeof(struct buffer_head *);
2266 for (order = 0; (1 << order) < mempages; order++)
2269 /* try to allocate something until we get it or we're asking
2270 for something that is really too small */
2272 do {
2273 unsigned long tmp;
2275 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2276 bh_hash_mask = (nr_hash - 1);
2278 tmp = nr_hash;
2279 bh_hash_shift = 0;
2280 while((tmp >>= 1UL) != 0UL)
2281 bh_hash_shift++;
2283 hash_table = (struct buffer_head **)
2284 __get_free_pages(GFP_ATOMIC, order);
2285 } while (hash_table == NULL && --order > 0);
2286 printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n",
2287 nr_hash, order, (PAGE_SIZE << order));
2289 if (!hash_table)
2290 panic("Failed to allocate buffer hash table\n");
2292 /* Setup hash chains. */
2293 for(i = 0; i < nr_hash; i++)
2294 hash_table[i] = NULL;
2296 /* Setup free lists. */
2297 for(i = 0; i < NR_SIZES; i++) {
2298 free_list[i].list = NULL;
2299 free_list[i].lock = SPIN_LOCK_UNLOCKED;
2302 /* Setup lru lists. */
2303 for(i = 0; i < NR_LIST; i++)
2304 lru_list[i] = NULL;
2306 bh_cachep = kmem_cache_create("buffer_head",
2307 sizeof(struct buffer_head),
2309 SLAB_HWCACHE_ALIGN, NULL, NULL);
2310 if(!bh_cachep)
2311 panic("Cannot create buffer head SLAB cache\n");
2315 /* ====================== bdflush support =================== */
2317 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2318 * response to dirty buffers. Once this process is activated, we write back
2319 * a limited number of buffers to the disks and then go back to sleep again.
2321 static DECLARE_WAIT_QUEUE_HEAD(bdflush_done);
2322 struct task_struct *bdflush_tsk = 0;
2324 void wakeup_bdflush(int block)
2326 DECLARE_WAITQUEUE(wait, current);
2328 if (current == bdflush_tsk)
2329 return;
2331 if (!block) {
2332 wake_up_process(bdflush_tsk);
2333 return;
2336 /* kflushd can wake us up before we have a chance to
2337    go to sleep, so we must be smart in handling
2338    this wakeup event from kflushd to avoid deadlocking on SMP
2339    (we are not holding any locks anymore in these two paths). */
2340 __set_current_state(TASK_UNINTERRUPTIBLE);
2341 add_wait_queue(&bdflush_done, &wait);
2343 wake_up_process(bdflush_tsk);
2344 schedule();
2346 remove_wait_queue(&bdflush_done, &wait);
2347 __set_current_state(TASK_RUNNING);
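/*
 * Editor's illustration: the typical caller is the balance_dirty() path
 * earlier in this file -- a writer that has dirtied too much of the buffer
 * cache asks bdflush for help, and only passes block != 0 (and therefore
 * waits for bdflush_done) when things are bad enough that it should
 * throttle itself.  The fragment below is a paraphrase of that caller, not
 * a verbatim copy; treat the exact return-value convention of
 * balance_dirty_state() as an assumption of this sketch.
 */
static void example_balance_dirty(kdev_t dev)
{
	int state = balance_dirty_state(dev);

	if (state < 0)
		return;			/* not enough dirty buffers to bother */
	/* state > 0 means "too dirty": block until bdflush has done a pass. */
	wakeup_bdflush(state);
}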
2350 /* This is the _only_ function that deals with flushing async writes
2351 to disk.
2352 NOTENOTENOTENOTE: we _only_ need to browse the DIRTY lru list
2353 as all dirty buffers live _only_ in the DIRTY lru list.
2354 As we never browse the LOCKED and CLEAN lru lists, they are in fact
2355 completely useless. */
2356 static int flush_dirty_buffers(int check_flushtime)
2358 struct buffer_head * bh, *next;
2359 int flushed = 0, i;
2361 restart:
2362 spin_lock(&lru_list_lock);
2363 bh = lru_list[BUF_DIRTY];
2364 if (!bh)
2365 goto out_unlock;
2366 for (i = nr_buffers_type[BUF_DIRTY]; i-- > 0; bh = next) {
2367 next = bh->b_next_free;
2369 if (!buffer_dirty(bh)) {
2370 __refile_buffer(bh);
2371 continue;
2373 if (buffer_locked(bh))
2374 continue;
2376 if (check_flushtime) {
2377 /* The dirty lru list is chronologically ordered so
2378 if the current bh is not yet timed out,
2379 then all the following bhs
2380 will be too young as well. */
2381 if (time_before(jiffies, bh->b_flushtime))
2382 goto out_unlock;
2383 } else {
2384 if (++flushed > bdf_prm.b_un.ndirty)
2385 goto out_unlock;
2388 /* OK, now we are committed to write it out. */
2389 atomic_inc(&bh->b_count);
2390 spin_unlock(&lru_list_lock);
2391 ll_rw_block(WRITE, 1, &bh);
2392 atomic_dec(&bh->b_count);
2394 if (current->need_resched)
2395 schedule();
2396 goto restart;
2398 out_unlock:
2399 spin_unlock(&lru_list_lock);
2401 return flushed;
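/*
 * Editor's note on the b_flushtime test above: b_flushtime is a deadline in
 * jiffies, and time_before() compares two jiffies values using a signed
 * difference so the comparison stays correct when the jiffies counter wraps
 * around.  The helper below is a paraphrase of that idea (the real macro is
 * provided by the kernel headers, not defined here).
 */
static inline int example_not_yet_due(unsigned long now, unsigned long deadline)
{
	/* Equivalent in spirit to time_before(now, deadline). */
	return (long) (now - deadline) < 0;
}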
2405  * Here we attempt to write back old buffers. We also try to flush inodes
2406  * and supers, since this function is essentially "update", and
2407  * otherwise there would be no way of ensuring that these quantities ever
2408  * get written back. Ideally, we would have a timestamp on the inodes
2409  * and superblocks so that we could write back only the old ones as well.
2412 static int sync_old_buffers(void)
2414 lock_kernel();
2415 sync_supers(0);
2416 sync_inodes(0);
2417 unlock_kernel();
2419 flush_dirty_buffers(1);
2420 /* must really sync all the active I/O requests to disk here */
2421 run_task_queue(&tq_disk);
2422 return 0;
2425 int block_sync_page(struct page *page)
2427 run_task_queue(&tq_disk);
2428 return 0;
2431 /* This is the interface to bdflush. As we get more sophisticated, we can
2432 * pass tuning parameters to this "process", to adjust how it behaves.
2433 * We would want to verify each parameter, however, to make sure that it
2434 * is reasonable. */
2436 asmlinkage long sys_bdflush(int func, long data)
2438 if (!capable(CAP_SYS_ADMIN))
2439 return -EPERM;
2441 if (func == 1) {
2442 /* do_exit directly and let kupdate do its work alone. */
2443 do_exit(0);
2444 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2445 a syscall that doesn't care about the current mm context. */
2446 int error;
2447 struct mm_struct *user_mm;
2450  * bdflush will spend all of its time in kernel-space,
2451  * without touching user-space, so we can switch it into
2452  * 'lazy TLB mode' to reduce the cost of context-switches
2453  * to and from bdflush.
2454  */
2455 user_mm = start_lazy_tlb();
2456 error = sync_old_buffers();
2457 end_lazy_tlb(user_mm);
2458 return error;
2459 #endif
2462 /* Even func values starting at 2 read a parameter, odd ones write it:
	   func 2 reads param 0, func 3 writes param 0, func 4 reads param 1, etc. */
2463 if (func >= 2) {
2464 int i = (func-2) >> 1;
2465 if (i >= 0 && i < N_PARAM) {
2466 if ((func & 1) == 0)
2467 return put_user(bdf_prm.data[i], (int*)data);
2469 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
2470 bdf_prm.data[i] = data;
2471 return 0;
2474 return -EINVAL;
2477 /* Calling with func 0 used to launch the actual bdflush and then never
2478  * return (unless it was explicitly killed). We return zero here to
2479  * remain semi-compatible with present update(8) programs.
2480  */
2481 return 0;
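/*
 * Editor's illustration (userspace, not kernel code): given the encoding
 * above, func 2*i+2 reads tuning parameter i through the pointer passed in
 * "data", and func 2*i+3 sets parameter i to the value passed in "data"
 * (rejected with -EINVAL when out of range, -EPERM without CAP_SYS_ADMIN).
 * A tuning tool could therefore do roughly the following; whether the libc
 * headers define SYS_bdflush is an assumption of this sketch.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int value = 0;

	/* func 2: read parameter 0 into "value". */
	if (syscall(SYS_bdflush, 2, (long) &value) == 0)
		printf("bdflush parameter 0 is %d\n", value);
	else
		perror("bdflush read");

	/* func 3: write parameter 0 back unchanged. */
	if (syscall(SYS_bdflush, 3, (long) value) != 0)
		perror("bdflush write");

	return 0;
}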
2485 * This is the actual bdflush daemon itself. It used to be started from
2486 * the syscall above, but now we launch it ourselves internally with
2487 * kernel_thread(...) directly after the first thread in init/main.c
2489 int bdflush(void *sem)
2491 struct task_struct *tsk = current;
2492 int flushed;
2493 /*
2494  * We have a bare-bones task_struct, and really should fill
2495  * in a few more things so "top" and /proc/2/{exe,root,cwd}
2496  * display semi-sane things. Not really crucial, though...
2499 tsk->session = 1;
2500 tsk->pgrp = 1;
2501 strcpy(tsk->comm, "kflushd");
2502 bdflush_tsk = tsk;
2504 /* avoid getting signals */
2505 spin_lock_irq(&tsk->sigmask_lock);
2506 flush_signals(tsk);
2507 sigfillset(&tsk->blocked);
2508 recalc_sigpending(tsk);
2509 spin_unlock_irq(&tsk->sigmask_lock);
2511 up((struct semaphore *)sem);
2513 for (;;) {
2514 CHECK_EMERGENCY_SYNC
2516 flushed = flush_dirty_buffers(0);
2518 /* If wakeup_bdflush() wakes us up
2519    after our bdflush_done wakeup, then
2520    we must make sure not to sleep
2521    in schedule() below, otherwise
2522    wakeup_bdflush() may wait for a
2523    bdflush_done wakeup that would never arrive
2524    (as we would be sleeping) and so it would
2525    deadlock on SMP. */
2526 __set_current_state(TASK_INTERRUPTIBLE);
2527 wake_up(&bdflush_done);
2528 /*
2529  * If there are still a lot of dirty buffers around,
2530  * skip the sleep and flush some more. Otherwise, we
2531  * go to sleep waiting for a wakeup.
2532  */
2533 if (!flushed || balance_dirty_state(NODEV) < 0)
2534 schedule();
2535 /* Remember to mark us as running otherwise
2536 the next schedule will block. */
2537 __set_current_state(TASK_RUNNING);
2542  * This is the kernel update daemon. It used to live in userspace,
2543  * but since it needs to run safely we don't want it killed by mistake.
2544  * You don't need to change your userspace configuration, since
2545  * the userspace `update` will do_exit(0) at the first sys_bdflush().
2546  */
2547 int kupdate(void *sem)
2549 struct task_struct * tsk = current;
2550 int interval;
2552 tsk->session = 1;
2553 tsk->pgrp = 1;
2554 strcpy(tsk->comm, "kupdate");
2556 /* sigstop and sigcont will stop and wake up kupdate */
2557 spin_lock_irq(&tsk->sigmask_lock);
2558 sigfillset(&tsk->blocked);
2559 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
2560 recalc_sigpending(tsk);
2561 spin_unlock_irq(&tsk->sigmask_lock);
2563 up((struct semaphore *)sem);
2565 for (;;) {
2566 /* update interval */
2567 interval = bdf_prm.b_un.interval;
2568 if (interval) {
2569 tsk->state = TASK_INTERRUPTIBLE;
2570 schedule_timeout(interval);
2571 } else {
2572 stop_kupdate:
2573 tsk->state = TASK_STOPPED;
2574 schedule(); /* wait for SIGCONT */
2576 /* check for sigstop */
2577 if (signal_pending(tsk)) {
2578 int stopped = 0;
2579 spin_lock_irq(&tsk->sigmask_lock);
2580 if (sigismember(&tsk->signal, SIGSTOP)) {
2581 sigdelset(&tsk->signal, SIGSTOP);
2582 stopped = 1;
2584 recalc_sigpending(tsk);
2585 spin_unlock_irq(&tsk->sigmask_lock);
2586 if (stopped)
2587 goto stop_kupdate;
2589 #ifdef DEBUG
2590 printk("kupdate() activated...\n");
2591 #endif
2592 sync_old_buffers();
2596 static int __init bdflush_init(void)
2598 DECLARE_MUTEX_LOCKED(sem);
2599 kernel_thread(bdflush, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2600 down(&sem);
2601 kernel_thread(kupdate, &sem, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
2602 down(&sem);
2603 return 0;
2606 module_init(bdflush_init)